├── .gitignore ├── .travis.yml ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── aws ├── aws_launcher.py └── userdata.sh ├── bin ├── build_pipeline_heka.sh └── install_dependencies.osx.sh ├── doc └── derived_streams.md ├── examples ├── basic_local_pipeline.toml ├── decode_telemetry.toml ├── monitor_dnt.lua ├── payload_size_devel.toml ├── payload_size_devel_filter.json └── request_rates.lua ├── heka ├── cmd │ ├── heka-export │ │ └── main.go │ ├── heka-s3cat │ │ └── main.go │ ├── heka-s3list │ │ └── main.go │ └── s3cat │ │ └── main.go ├── patches │ ├── 0002-Add-cmdline-tool-for-uploading-to-S3.patch │ └── 0003-Add-more-cmds.patch ├── plugins │ ├── fx │ │ ├── common.c │ │ ├── common.h │ │ ├── executive_report.c │ │ ├── xxhash.c │ │ └── xxhash.h │ ├── hash │ │ └── lua_hash.c │ ├── kafkaconsumergroup │ │ ├── kafka_consumer_group_input.go │ │ └── kafka_consumer_group_input_test.go │ ├── s3splitfile │ │ ├── all_specs_test.go │ │ ├── s3offset_input.go │ │ ├── s3splitfile_common.go │ │ ├── s3splitfile_common_test.go │ │ ├── s3splitfile_input.go │ │ ├── s3splitfile_output.go │ │ └── testsupport │ │ │ └── schema.json │ ├── snap │ │ ├── snappy_decoder.go │ │ └── snappy_encoder.go │ └── sqs3 │ │ └── sqs3_input.go └── sandbox │ ├── decoders │ ├── decompress_payload.lua │ ├── extract_executive_summary.lua │ ├── extract_fhr_dimensions.lua │ ├── extract_telemetry_dimensions.lua │ ├── extract_tls_info.lua │ └── http_edge_decoder.lua │ ├── encoders │ ├── combine_telemetry_objects.lua │ └── es_fields.lua │ └── filters │ ├── count_by_normalized_channel.lua │ ├── dollars.lua │ ├── fhr_requests.lua │ ├── firefox_active_instances.lua │ ├── firefox_aggregator.lua │ ├── firefox_channel_switching.lua │ ├── firefox_duplicates.lua │ ├── firefox_executive_report.lua │ ├── firefox_searches.lua │ ├── firefox_usage.lua │ ├── payload_size.lua │ ├── telemetry_decoder_view.lua │ ├── telemetry_latency.lua │ ├── telemetry_requests.lua │ ├── telemetry_s3output_monitors.lua │ └── telemetry_webrtc.lua ├── hindsight ├── analysis │ └── landfill_error.lua ├── input │ ├── heka_s3.lua │ ├── serverlog_s3_bootstrap.lua │ ├── telemetry_s3_bootstrap.lua │ ├── telemetry_s3_landfill.lua │ └── telemetry_s3_snappy.lua ├── io_modules │ ├── derived_stream.lua │ ├── derived_stream │ │ ├── heka_protobuf.lua │ │ ├── redshift.lua │ │ ├── redshift │ │ │ ├── psv.lua │ │ │ └── sql.lua │ │ └── tsv.lua │ └── telemetry │ │ └── s3.lua ├── modules │ ├── agg.lua │ ├── fx.lua │ └── fx │ │ └── ping.lua └── output │ ├── cbuf2tsv.lua │ ├── crash_summary.lua │ ├── executive_summary.lua │ ├── executive_summary_full.lua │ ├── main_summary.lua │ └── telemetry_s3.lua └── reports ├── budget ├── budget.toml ├── check_targets.py ├── package.sh ├── run.sh └── schema_template.json ├── crash_stats_oom └── getting-crash-stats-for-OOM-data-to-S3.ipynb ├── derived_streams ├── derived_streams │ ├── hindsight.cfg │ ├── run │ │ ├── input │ │ │ ├── popen.lua │ │ │ ├── popen01.cfg │ │ │ ├── popen02.cfg │ │ │ ├── popen03.cfg │ │ │ ├── popen04.cfg │ │ │ ├── popen05.cfg │ │ │ ├── popen06.cfg │ │ │ ├── popen07.cfg │ │ │ ├── popen08.cfg │ │ │ ├── popen09.cfg │ │ │ ├── popen10.cfg │ │ │ ├── popen11.cfg │ │ │ ├── popen12.cfg │ │ │ ├── popen13.cfg │ │ │ ├── popen14.cfg │ │ │ ├── popen15.cfg │ │ │ ├── popen16.cfg │ │ │ ├── prune_input.cfg │ │ │ └── prune_input.lua │ │ └── output │ │ │ ├── crash_summary.lua │ │ │ ├── crash_summary01.cfg │ │ │ ├── crash_summary02.cfg │ │ │ ├── crash_summary03.cfg │ │ │ ├── crash_summary04.cfg │ │ │ ├── crash_summary05.cfg │ │ │ ├── crash_summary06.cfg │ 
│ │ ├── crash_summary07.cfg │ │ │ ├── crash_summary08.cfg │ │ │ ├── crash_summary09.cfg │ │ │ ├── crash_summary10.cfg │ │ │ ├── crash_summary11.cfg │ │ │ ├── crash_summary12.cfg │ │ │ ├── crash_summary13.cfg │ │ │ ├── crash_summary14.cfg │ │ │ ├── crash_summary15.cfg │ │ │ ├── crash_summary16.cfg │ │ │ ├── executive_summary01.cfg │ │ │ ├── executive_summary02.cfg │ │ │ ├── executive_summary03.cfg │ │ │ ├── executive_summary04.cfg │ │ │ ├── executive_summary05.cfg │ │ │ ├── executive_summary06.cfg │ │ │ ├── executive_summary07.cfg │ │ │ ├── executive_summary08.cfg │ │ │ ├── executive_summary09.cfg │ │ │ ├── executive_summary10.cfg │ │ │ ├── executive_summary11.cfg │ │ │ ├── executive_summary12.cfg │ │ │ ├── executive_summary13.cfg │ │ │ ├── executive_summary14.cfg │ │ │ ├── executive_summary15.cfg │ │ │ ├── executive_summary16.cfg │ │ │ ├── executive_summary_full.lua │ │ │ ├── main_summary.lua │ │ │ ├── main_summary01.cfg │ │ │ ├── main_summary02.cfg │ │ │ ├── main_summary03.cfg │ │ │ ├── main_summary04.cfg │ │ │ ├── main_summary05.cfg │ │ │ ├── main_summary06.cfg │ │ │ ├── main_summary07.cfg │ │ │ ├── main_summary08.cfg │ │ │ ├── main_summary09.cfg │ │ │ ├── main_summary10.cfg │ │ │ ├── main_summary11.cfg │ │ │ ├── main_summary12.cfg │ │ │ ├── main_summary13.cfg │ │ │ ├── main_summary14.cfg │ │ │ ├── main_summary15.cfg │ │ │ └── main_summary16.cfg │ ├── schema_template.json │ └── splitter.lua ├── hindsight │ └── bin │ │ ├── hindsight │ │ └── hindsight_cli ├── luasandbox-0.10.2-Linux-core.deb ├── package.sh ├── run.sh └── snappy.so ├── engagement_ratio ├── MauDau.ipynb └── README.txt ├── executive_summary ├── README.txt ├── package.sh ├── reformat_v4.py ├── run.sh └── run_executive_report.py ├── fennec_dashboard ├── README.txt └── summarize_csv.ipynb ├── loop ├── hindsight.cfg └── run │ ├── analysis │ ├── hll_check.lua │ ├── hll_check.off │ ├── retention.lua │ ├── retention_daily.cfg │ ├── retention_monthly.cfg │ ├── retention_weekly.cfg │ ├── xau.cfg │ └── xau.lua │ ├── input │ ├── server_logs.cfg │ └── server_logs.lua │ └── output │ └── placeholder.off ├── socorro_import └── ImportCrashData.ipynb ├── stability-summary ├── rollup.py ├── run.sh ├── summarize.py └── utils.py └── update-orphaning └── Update orphaning analysis using longitudinal dataset.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Scratch dir for building Heka 2 | build/ 3 | 4 | # Operating System gunk 5 | .DS_Store 6 | Thumbs.db 7 | 8 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 9 | *.o 10 | *.a 11 | *.so 12 | 13 | # Folders 14 | _obj 15 | _test 16 | 17 | # Architecture specific extensions/prefixes 18 | *.[568vq] 19 | [568vq].out 20 | 21 | *.cgo1.go 22 | *.cgo2.c 23 | _cgo_defun.c 24 | _cgo_gotypes.go 25 | _cgo_export.* 26 | 27 | _testmain.go 28 | 29 | *.exe 30 | *.test 31 | *.prof 32 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | go: 3 | - 1.4 4 | notifications: 5 | irc: 6 | channels: 7 | - "irc.mozilla.org#datapipeline" 8 | before_install: 9 | - sudo add-apt-repository ppa:andykimpe/cmake3 -y 10 | - sudo add-apt-repository ppa:maxmind/ppa -y 11 | - sudo apt-get update -qq 12 | - sudo apt-get install -y protobuf-compiler cmake libgeoip-dev libpq-dev 13 | 14 | install: 15 | - bash bin/build_pipeline_heka.sh 16 | 17 | script: 18 | - cd build/heka 19 | - . 
env.sh 20 | - cd build 21 | - make 22 | - go test github.com/mozilla-services/data-pipeline/s3splitfile 23 | - go test github.com/mozilla-services/data-pipeline/kafkaconsumergroup 24 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Community Participation Guidelines 2 | 3 | This repository is governed by Mozilla's code of conduct and etiquette guidelines. 4 | For more details, please read the 5 | [Mozilla Community Participation Guidelines](https://www.mozilla.org/about/governance/policies/participation/). 6 | 7 | ## How to Report 8 | For more information on how to report violations of the Community Participation Guidelines, please read our '[How to Report](https://www.mozilla.org/about/governance/policies/participation/reporting/)' page. 9 | 10 | 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Mozilla Services Data Pipeline 2 | 3 | This repository contains the extra bits and pieces needed to build heka 4 | for use in the [Cloud Services Data Pipeline](https://wiki.mozilla.org/CloudServices/DataPipeline). 5 | 6 | Visit us on irc.mozilla.org in `#datapipeline`. 7 | 8 | ## Building a Data Pipeline RPM 9 | 10 | Run `bash bin/build_pipeline_heka.sh` from the top level of this repo to build a heka RPM. 11 | 12 | ## Using the Data Pipeline 13 | 14 | If you are simply looking to test out some data analysis plugins and don't want to set up your own pipeline, here is the fastest way to get going: 15 | https://mana.mozilla.org/wiki/display/CLOUDSERVICES/Using+the+sandbox+manager+in+the+prod+prototype+pipeline 16 | 17 | ## Running/Testing Your Own Data Pipeline 18 | 19 | You can set up a bare-bones data pipeline of your own. You will get an endpoint that listens for HTTP POST requests, performs GeoIP lookups, and wraps them up in protobuf messages. These messages will be relayed to a stream-processor, and will be output to a local store on disk. There will be basic web-based monitoring, and the ability to add your own stream processing filters. 20 | 21 | 1. Clone this data-pipeline github repo 22 | 23 | ``` 24 | git clone https://github.com/mozilla-services/data-pipeline.git 25 | ``` 26 | 27 | 2. Build and configure heka. If you are unable to build heka, drop by #datapipeline on irc.mozilla.org and we will try to provide you with a pre-built version. 28 | 1. Make sure you have the dependencies installed: 29 | 1. OpenSSL v1.0+ (required by lua_openssl) 30 | 2. libpq, the PostgreSQL API 31 | 2. Run `bash bin/build_pipeline_heka.sh` 32 | 3. Install lua modules 33 | 34 | ``` 35 | mkdir lua_modules 36 | rsync -av build/heka/build/heka/lib/luasandbox/modules/ lua_modules/ 37 | rsync -av hindsight/modules/ hindsight/io_modules/ lua_modules/ 38 | ``` 39 | 40 | 4. Procure a `GeoLiteCity.dat` file and put it in the current dir 41 | 42 | ``` 43 | wget http://geolite.maxmind.com/download/geoip/database/GeoLiteCity.dat.gz 44 | gunzip GeoLiteCity.dat.gz 45 | ``` 46 | 47 | 3. Set up the main Pipeline using the `examples/basic_local_pipeline.toml` config file. This will listen for HTTP POSTs on port 8080, log the raw requests and decoded messages to stdout, run the example filter, and output the records to a file. 48 | 49 | ``` 50 | build/heka/build/heka/bin/hekad -config examples/basic_local_pipeline.toml 51 | ``` 52 | 53 | 4.
Check the monitoring dashboard at [http://localhost:4352](http://localhost:4352) 54 | 5. Fire off some test submissions! 55 | 56 | ``` 57 | for f in $(seq 1 20); do 58 | curl -X POST "http://localhost:8080/submit/test/$f/foo/bar/baz" -d "{\"test\":$f}" 59 | done 60 | ``` 61 | 62 | 6. Verify that your data was stored in the output file using the `heka-cat` utility 63 | 64 | ``` 65 | build/heka/build/heka/bin/heka-cat data_raw.out 66 | build/heka/build/heka/bin/heka-cat data_decoded.out 67 | ``` 68 | 69 | 7. Experiment with sandbox filters, outputs, and configurations. 70 | 71 | ### Useful things to know 72 | 73 | - GeoIP 74 | - It’s not terribly interesting to do GeoIP lookups on 127.0.0.1, so you may want to provide a `--header "X-Forwarded-For: 8.8.8.8"` argument to your curl commands. That will force a GeoIP lookup on the specified IP address (Google’s DNS server in this example). 75 | - How to configure namespaces 76 | - The example config allows submissions to either `/submit/telemetry/docid/more/path/stuff` or `/submit/test/id/and/so/on` 77 | - You can add more endpoints by modifying the `namespace_config` parameter in `basic_local_pipeline.toml`. 78 | - The namespace config is more manageable if you keep the JSON in a separate file, and run it through something like `jq -c '.' < my_namespaces.json` before putting it into the toml config. 79 | - Where to get more info about configuring heka 80 | - http://hekad.readthedocs.org/en/latest/index.html 81 | -------------------------------------------------------------------------------- /aws/aws_launcher.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
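# Usage sketch (added for illustration, not part of the original script's docs).
# The flags come from get_arg_parser() below; "my_config.json" and "myname" are
# hypothetical values. Run this from the aws/ directory so read_user_data() can
# find userdata.sh:
#
#   python aws_launcher.py --config-file my_config.json --owner myname
#
# Settings in the JSON config file override the entries in default_config below.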
7 | 8 | # ideas from 9 | # https://github.com/mozilla/telemetry-server/tree/master/provisioning/aws 10 | 11 | import argparse 12 | import json 13 | import sys 14 | import traceback 15 | import time 16 | 17 | try: 18 | import boto.ec2 19 | from boto.ec2.blockdevicemapping import BlockDeviceType 20 | from boto.ec2.blockdevicemapping import BlockDeviceMapping 21 | except: 22 | sys.stderr.write("Requires boto; try 'pip install boto'\n") 23 | exit(1) 24 | 25 | default_config = { 26 | "image": "ami-5189a661", 27 | "region": "us-west-2", 28 | "key_name": "20130730-svcops-base-key-dev", 29 | "instance_type": "c3.2xlarge", 30 | "security_groups": ["pipeline-analysis"], 31 | "iam_role": "pipeline-dev-iam-access-IamInstanceProfile-YVZ950U23IFP", 32 | "shutdown": "terminate", 33 | "ephemeral_map": { 34 | "/dev/xvdb": "ephemeral0", 35 | "/dev/xvdc": "ephemeral1" 36 | }, 37 | "owner": "datapipeline", 38 | "tags": { 39 | "App": "pipeline", 40 | "Type": "analysis", 41 | "Env": "dev", 42 | } 43 | } 44 | 45 | 46 | class Launcher(object): 47 | def __init__(self): 48 | parser = self.get_arg_parser() 49 | args = parser.parse_args() 50 | self.read_user_data() 51 | self.setup_config(args) 52 | 53 | def get_arg_parser(self): 54 | parser = argparse.ArgumentParser(description='Launch EC2 instances') 55 | parser.add_argument( 56 | "-c", "--config-file", 57 | help="JSON config file", 58 | type=file, 59 | default=None 60 | ) 61 | parser.add_argument( 62 | "-k", "--aws-key", 63 | help="AWS Key", 64 | default=None 65 | ) 66 | parser.add_argument( 67 | "-s", "--aws-secret-key", 68 | help="AWS Secret Key", 69 | default=None 70 | ) 71 | parser.add_argument( 72 | "-o", "--owner", 73 | help="AWS owner tag", 74 | default=None 75 | ) 76 | return parser 77 | 78 | def read_user_data(self): 79 | with open("userdata.sh", "r") as fh: 80 | self.user_data = fh.read() 81 | 82 | def setup_config(self, args): 83 | self.config = default_config.copy() 84 | if args.config_file: 85 | user_config = json.load(args.config_file) 86 | self.config.update(user_config) 87 | if args.aws_key: 88 | self.config["aws_key"] = args.aws_key 89 | if args.aws_secret_key: 90 | self.config["aws_secret_key"] = args.aws_secret_key 91 | if args.owner: 92 | self.config["owner"] = args.owner 93 | 94 | def fire_up_instance(self): 95 | self.conn = boto.ec2.connect_to_region( 96 | self.config["region"], 97 | aws_access_key_id=self.config.get("aws_key", None), 98 | aws_secret_access_key=self.config.get("aws_secret_key", None) 99 | ) 100 | 101 | mapping = BlockDeviceMapping() 102 | for device, eph_name in self.config["ephemeral_map"].iteritems(): 103 | mapping[device] = BlockDeviceType(ephemeral_name=eph_name) 104 | 105 | reservation = self.conn.run_instances( 106 | self.config["image"], 107 | key_name=self.config["key_name"], 108 | instance_type=self.config["instance_type"], 109 | security_groups=self.config["security_groups"], 110 | block_device_map=mapping, 111 | user_data=self.user_data, 112 | instance_profile_name=self.config["iam_role"], 113 | instance_initiated_shutdown_behavior=self.config["shutdown"] 114 | ) 115 | 116 | instance = reservation.instances[0] 117 | 118 | name_string = "{0}-{1}-{2}".format( 119 | self.config["owner"], 120 | self.config["tags"]["App"], 121 | self.config["tags"]["Type"]) 122 | owner_tags = {"Name": name_string, "Owner": self.config["owner"]} 123 | self.conn.create_tags([instance.id], owner_tags) 124 | self.conn.create_tags([instance.id], self.config["tags"]) 125 | 126 | while instance.state == 'pending': 127 | print "Instance is 
pending -- Waiting 10s for instance", \ 128 | instance.id, "to start up..." 129 | time.sleep(10) 130 | instance.update() 131 | 132 | print ("Instance {0} is {1}".format(instance.id, instance.state)) 133 | print ("ubuntu@{0}".format(instance.public_dns_name)) 134 | 135 | 136 | def main(): 137 | try: 138 | launcher = Launcher() 139 | launcher.fire_up_instance() 140 | return 0 141 | except Exception, e: 142 | print "Error:", e 143 | traceback.print_exc() 144 | return 1 145 | 146 | if __name__ == "__main__": 147 | sys.exit(main()) 148 | -------------------------------------------------------------------------------- /aws/userdata.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #sudo su - 4 | # As of 2015/09/10, build requires geoip >= 1.6.3. 5 | # Add a PPA for recent versions. 6 | add-apt-repository ppa:maxmind/ppa -y 7 | 8 | apt-get update 9 | apt-get --yes install mdadm xfsprogs jq git python-pip python-protobuf cmake libgeoip-dev zlib1g-dev mercurial debhelper libpq-dev libssl-dev 10 | pip install awscli boto 11 | umount /mnt 12 | yes | mdadm --create /dev/md0 --level=0 -c64 --raid-devices=2 /dev/xvdb /dev/xvdc 13 | echo 'DEVICE /dev/xvdb /dev/xvdc' >> /etc/mdadm/mdadm.conf 14 | mdadm --detail --scan >> /etc/mdadm/mdadm.conf 15 | mkfs.xfs /dev/md0 16 | mount /dev/md0 /mnt 17 | mkdir -p /mnt/work 18 | chown -R ubuntu:ubuntu /mnt/work 19 | 20 | cd /mnt/work 21 | wget https://storage.googleapis.com/golang/go1.4.2.linux-amd64.tar.gz 22 | tar -C /usr/local -xzf go1.4.2.linux-amd64.tar.gz 23 | 24 | wget http://people.mozilla.org/~mreid/heka-data-pipeline-linux-amd64.tar.gz 25 | tar xzvf heka-data-pipeline-linux-amd64.tar.gz 26 | 27 | echo "ubuntu hard nofile 200000" >> /etc/security/limits.conf 28 | echo "ubuntu soft nofile 50000" >> /etc/security/limits.conf 29 | -------------------------------------------------------------------------------- /bin/install_dependencies.osx.sh: -------------------------------------------------------------------------------- 1 | # As of 20160421, the latest cmake won't work. Install cmake 3.1 as a workaround. 2 | brew install openssl protobuf postgresql homebrew/versions/cmake31 3 | if [ -z "$(which go)" ]; then 4 | echo "You'll need to install go 1.4.x - see https://golang.org/dl/" 5 | fi 6 | -------------------------------------------------------------------------------- /doc/derived_streams.md: -------------------------------------------------------------------------------- 1 | ## Creating a derived stream 2 | 3 | - Follow the steps in the [`README`](../README.md) to set up a local pipeline. 4 | - Create a Sandbox Filter to extract the information you want. A simple example is the [`payload_size.lua`](../heka/sandbox/filters/payload_size.lua) filter. 5 | - Create a configuration file to test the filter during development. See [`payload_size_devel.toml`](../examples/payload_size_devel.toml) for an example config. For a derived stream based on Telemetry data, you will most likely use a `S3SplitFileInput` to read production data, and a `LogOutput` or `FileOutput` to view the resulting records locally. 6 | - Create a [JSON filter](../examples/payload_size_devel_filter.json) to limit the input data to a reasonable amount for testing. 7 | - Run it: 8 | ```bash 9 | export PATH=$PATH:build/heka/build/heka/bin 10 | hekad -config examples/payload_size_devel.toml 11 | # You should see several "payload_size" messages logged to the console. 
12 | # Check the resulting file output: 13 | heka-cat derived_data.out 14 | ``` 15 | 16 | -------------------------------------------------------------------------------- /examples/basic_local_pipeline.toml: -------------------------------------------------------------------------------- 1 | [hekad] 2 | base_dir = "." 3 | share_dir = "." 4 | # 8MB 5 | max_message_size = 8388608 6 | 7 | [RstEncoder] 8 | 9 | [TestInput] 10 | type = "HttpListenInput" 11 | address = "127.0.0.1:8080" 12 | request_headers = ["Content-Length", "X-Forwarded-For", "DNT", "Date"] 13 | decoder = "HttpEdgeDecoder" 14 | send_decode_failures = true 15 | 16 | [LogOutput] 17 | # Print all incoming http messages (both raw and decoded) 18 | type = "LogOutput" 19 | message_matcher = "Type == 'http_edge_incoming' || Type == 'heka.httpdata.request'" 20 | #message_matcher = "TRUE" 21 | encoder = "RstEncoder" 22 | 23 | [HttpEdgeDecoder] 24 | type = "SandboxDecoder" 25 | filename = "heka/sandbox/decoders/http_edge_decoder.lua" 26 | memory_limit = 90000000 27 | output_limit = 8388608 28 | [HttpEdgeDecoder.config] 29 | geoip_city_db = "GeoLiteCity.dat" 30 | namespace_config = '{"test":{"logger":"test_input","max_path_length":20480,"max_data_length":1048576},"telemetry":{"dimensions":["reason","appName","appVersion","appUpdateChannel","appBuildID"],"max_path_length":10240,"max_data_length":204800}}' 31 | 32 | [DashboardOutput] 33 | address = "localhost:4352" 34 | static_directory = "build/heka/dasher" 35 | ticker_interval = 1 36 | 37 | [PayloadEncoder] 38 | 39 | [RequestRates] 40 | type = "SandboxFilter" 41 | message_matcher = "Type == 'http_edge_incoming'" 42 | filename = "examples/request_rates.lua" 43 | ticker_interval = 10 44 | preserve_data = true 45 | output_limit = 256000 46 | 47 | [ProtobufEncoder] 48 | 49 | [ArchivePipelineOutput] 50 | type = "FileOutput" 51 | path = "./data_decoded.out" 52 | use_framing = true 53 | message_matcher = "Type == 'http_edge_incoming'" 54 | encoder = "ProtobufEncoder" 55 | 56 | [ArchiveRawOutput] 57 | type = "FileOutput" 58 | path = "./data_raw.out" 59 | use_framing = true 60 | message_matcher = "Type == 'heka.httpdata.request'" 61 | encoder = "ProtobufEncoder" 62 | -------------------------------------------------------------------------------- /examples/decode_telemetry.toml: -------------------------------------------------------------------------------- 1 | [hekad] 2 | base_dir = "." 3 | share_dir = "." 
4 | # 8MB 5 | max_message_size = 8388608 6 | 7 | [RstEncoder] 8 | 9 | [TestInput] 10 | type = "HttpListenInput" 11 | address = "127.0.0.1:8080" 12 | request_headers = ["Content-Length", "X-Forwarded-For", "DNT", "Date"] 13 | decoder = "TelemetryDecoders" 14 | send_decode_failures = true 15 | 16 | [LogOutput] 17 | # Print all incoming http messages (both raw and decoded) 18 | type = "LogOutput" 19 | #message_matcher = "Type == 'http_edge_incoming' || Type == 'heka.httpdata.request'" 20 | #message_matcher = "TRUE" 21 | message_matcher = "Logger != 'hekad'" 22 | encoder = "RstEncoder" 23 | 24 | [TelemetryDecoders] 25 | type = "MultiDecoder" 26 | subs = ["HttpEdgeDecoder", "TelemetryDecoder" , "ExecutiveSummary"] 27 | cascade_strategy = "all" 28 | log_sub_errors = true 29 | 30 | [HttpEdgeDecoder] 31 | type = "SandboxDecoder" 32 | filename = "heka/sandbox/decoders/http_edge_decoder.lua" 33 | memory_limit = 90000000 34 | output_limit = 8388608 35 | [HttpEdgeDecoder.config] 36 | geoip_city_db = "GeoLiteCity.dat" 37 | namespace_config = '{"test":{"logger":"test_input","max_path_length":20480,"max_data_length":1048576},"telemetry":{"dimensions":["docType","appName","appVersion","appUpdateChannel","appBuildId"],"max_path_length":10240,"max_data_length":204800},"sslreports":{"max_path_length":1024,"max_data_length":1048576}}' 38 | 39 | [TelemetryDecoder] 40 | type = "SandboxDecoder" 41 | filename = "heka/sandbox/decoders/extract_telemetry_dimensions.lua" 42 | memory_limit = 90000000 43 | output_limit = 2097152 44 | [TelemetryDecoder.config] 45 | duplicate_original = true 46 | 47 | [ExecutiveSummary] 48 | type = "SandboxDecoder" 49 | filename = "heka/sandbox/decoders/extract_executive_summary.lua" 50 | memory_limit = 90000000 51 | output_limit = 2097152 52 | [ExecutiveSummary.config] 53 | duplicate_original = true 54 | 55 | [DashboardOutput] 56 | address = "localhost:4352" 57 | static_directory = "build/heka/dasher" 58 | ticker_interval = 1 59 | 60 | [PayloadEncoder] 61 | [ProtobufEncoder] 62 | 63 | [TelemetryDecodedOutput] 64 | type = "FileOutput" 65 | path = "./data_decoded.out" 66 | use_framing = true 67 | message_matcher = "Logger == 'telemetry' && Type == 'telemetry'" 68 | encoder = "ProtobufEncoder" 69 | 70 | [TelemetryErrorOutput] 71 | type = "FileOutput" 72 | path = "./data_errors.out" 73 | use_framing = true 74 | message_matcher = "Logger == 'telemetry' && Type == 'telemetry.error'" 75 | encoder = "ProtobufEncoder" 76 | 77 | [TelemetryExecutiveSummaryOutput] 78 | type = "FileOutput" 79 | path = "./data_exsum.out" 80 | use_framing = true 81 | message_matcher = "Logger == 'fx' && Type == 'executive_summary'" 82 | encoder = "ProtobufEncoder" 83 | -------------------------------------------------------------------------------- /examples/monitor_dnt.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Monitor DNT header status 7 | 8 | *Example Heka Configuration* 9 | 10 | .. 
code-block:: ini 11 | 12 | [DNTUsage] 13 | type = "SandboxFilter" 14 | filename = "examples/monitor_dnt.lua" 15 | message_matcher = "Type == 'telemetry'" 16 | ticker_interval = 10 17 | preserve_data = true 18 | [DNTUsage.config] 19 | # Increment this if the format changes in a 20 | # backwards-incompatible way 21 | preservation_version = 1 22 | # Number of entries to keep in the circular buffer 23 | rows = 1440 24 | # Length of each bucket in the circular buffer 25 | sec_per_row = 300 26 | 27 | --]] 28 | _PRESERVATION_VERSION = read_config("preservation_version") or 0 29 | 30 | require "circular_buffer" 31 | 32 | -- Default to 2880 minute-long intervals 33 | local rows = read_config("rows") or 2880 34 | local sec_per_row = read_config("sec_per_row") or 60 35 | 36 | -- Create a circular buffer with three columns. It must 37 | -- be a global variable in order for 'preserve_data' to 38 | -- have any effect. 39 | c = circular_buffer.new(rows, 3, sec_per_row, true) 40 | 41 | -- Set the header names for the columns 42 | local ON = c:set_header(1, "DNT On") 43 | local OFF = c:set_header(2, "DNT Off") 44 | local UNK = c:set_header(3, "DNT Unknown") 45 | 46 | function process_message () 47 | local ts = read_message("Timestamp") 48 | local item = read_message("Fields[DNT]") 49 | 50 | if item == "1" then 51 | c:add(ts, ON, 1) 52 | elseif item == "0" then 53 | c:add(ts, OFF, 1) 54 | else 55 | c:add(ts, UNK, 1) 56 | end 57 | 58 | return 0 59 | end 60 | 61 | function timer_event(ns) 62 | -- Inject the entire circular buffer 63 | inject_payload("cbuf", "DNT Status", c:format("cbuf")) 64 | 65 | -- Inject the cbuf delta (changes since last timer event) 66 | inject_payload("cbufd", "DNT Status", c:format("cbufd")) 67 | end 68 | -------------------------------------------------------------------------------- /examples/payload_size_devel.toml: -------------------------------------------------------------------------------- 1 | [hekad] 2 | base_dir = "." 3 | share_dir = "." 4 | # 8MB 5 | max_message_size = 8388608 6 | 7 | # Decode S3 data 8 | [SnappyDecoder] 9 | [Multi] 10 | type = "MultiDecoder" 11 | subs = ["SnappyDecoder", "ProtobufDecoder"] 12 | cascade_strategy = "all" 13 | log_sub_errors = true 14 | 15 | # Read data from S3 16 | [DevInput] 17 | type = "S3SplitFileInput" 18 | s3_bucket = "net-mozaws-prod-us-west-2-pipeline-data" 19 | s3_bucket_prefix = "telemetry-2" 20 | s3_worker_count = 16 21 | s3_read_timeout = 600 22 | schema_file = "examples/payload_size_devel_filter.json" 23 | decoder = "Multi" 24 | 25 | # Generate the "payload_size" derived stream messages. 
26 | [PayloadSize] 27 | type = "SandboxFilter" 28 | filename = "heka/sandbox/filters/payload_size.lua" 29 | message_matcher = "Type == 'telemetry' && Logger == 'telemetry'" 30 | ticker_interval = 0 31 | preserve_data = false 32 | 33 | 34 | # Print both derived-stream messages to the console 35 | [RstEncoder] 36 | [LogOutput] 37 | type = "LogOutput" 38 | message_matcher = "Type == 'heka.sandbox.payload_size'" 39 | encoder = "RstEncoder" 40 | 41 | # Write derived-stream messages to local disk 42 | [ProtobufEncoder] 43 | [ArchiveOutput] 44 | type = "FileOutput" 45 | path = "./derived_data.out" 46 | use_framing = true 47 | message_matcher = "Logger == 'PayloadSize' && Type == 'heka.sandbox.payload_size'" 48 | encoder = "ProtobufEncoder" 49 | -------------------------------------------------------------------------------- /examples/payload_size_devel_filter.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dimensions": [ 4 | { "field_name": "submissionDate", "allowed_values": "20151001" }, 5 | { "field_name": "sourceName", "allowed_values": "telemetry" }, 6 | { "field_name": "sourceVersion", "allowed_values": "4" }, 7 | { "field_name": "docType", "allowed_values": "main" }, 8 | { "field_name": "appName", "allowed_values": "Firefox" }, 9 | { "field_name": "appUpdateChannel", "allowed_values": "nightly" }, 10 | { "field_name": "appVersion", "allowed_values": "42.0a1" }, 11 | { "field_name": "appBuildId", "allowed_values": "20150629134017" } 12 | ] 13 | } 14 | 15 | -------------------------------------------------------------------------------- /examples/request_rates.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | _PRESERVATION_VERSION = 1 5 | 6 | require "circular_buffer" 7 | 8 | local title = "Requests" 9 | local rows = read_config("rows") or 14400 10 | local sec_per_row = read_config("sec_per_row") or 60 11 | 12 | cbuf = circular_buffer.new(rows, 1, sec_per_row) 13 | cbuf:set_header(1, "Requests") 14 | 15 | function process_message () 16 | cbuf:add(read_message("Timestamp"), 1, 1) 17 | return 0 18 | end 19 | 20 | function timer_event(ns) 21 | inject_payload("cbuf", title, cbuf) 22 | end 23 | -------------------------------------------------------------------------------- /heka/cmd/heka-s3list/main.go: -------------------------------------------------------------------------------- 1 | /***** BEGIN LICENSE BLOCK ***** 2 | # This Source Code Form is subject to the terms of the Mozilla Public 3 | # License, v. 2.0. If a copy of the MPL was not distributed with this file, 4 | # You can obtain one at http://mozilla.org/MPL/2.0/. 5 | # ***** END LICENSE BLOCK *****/ 6 | 7 | /* 8 | 9 | A command-line utility for listing files on Amazon S3, filtered by dimension. 
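Illustrative invocation (a sketch, not from the original docs; the flags are
defined in main() below, and the bucket/schema values are hypothetical). The
schema is loaded with s3splitfile.LoadSchema, so it uses the same dimension-filter
JSON format as examples/payload_size_devel_filter.json:

    heka-s3list -schema filter.json -bucket my-telemetry-bucket -bucket-prefix telemetry-2 -verbose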
10 | 11 | */ 12 | package main 13 | 14 | import ( 15 | "flag" 16 | "fmt" 17 | "github.com/AdRoll/goamz/aws" 18 | "github.com/AdRoll/goamz/s3" 19 | "github.com/mozilla-services/data-pipeline/s3splitfile" 20 | "os" 21 | "time" 22 | ) 23 | 24 | func main() { 25 | flagSchema := flag.String("schema", "", "Filename of the schema to use as a filter") 26 | flagBucket := flag.String("bucket", "default-bucket", "S3 Bucket name") 27 | flagBucketPrefix := flag.String("bucket-prefix", "", "S3 Bucket path prefix") 28 | flagAWSKey := flag.String("aws-key", "", "AWS Key") 29 | flagAWSSecretKey := flag.String("aws-secret-key", "", "AWS Secret Key") 30 | flagAWSRegion := flag.String("aws-region", "us-west-2", "AWS Region") 31 | flagDryRun := flag.Bool("dry-run", false, "Don't actually do anything, just output what would be done") 32 | flagVerbose := flag.Bool("verbose", false, "Print detailed info") 33 | flag.Parse() 34 | 35 | if flag.NArg() != 0 { 36 | flag.PrintDefaults() 37 | os.Exit(1) 38 | } 39 | 40 | var err error 41 | var schema s3splitfile.Schema 42 | schema, err = s3splitfile.LoadSchema(*flagSchema) 43 | if err != nil { 44 | fmt.Printf("schema: %s\n", err) 45 | os.Exit(2) 46 | } 47 | 48 | if *flagDryRun { 49 | fmt.Printf("Dry Run: Would have listed files in s3://%s/%s according to filter schema %s\n", 50 | *flagBucket, *flagBucketPrefix, *flagSchema) 51 | os.Exit(0) 52 | } 53 | 54 | var b *s3.Bucket 55 | 56 | prefix := s3splitfile.CleanBucketPrefix(*flagBucketPrefix) 57 | 58 | // Initialize the S3 bucket 59 | auth, err := aws.GetAuth(*flagAWSKey, *flagAWSSecretKey, "", time.Now()) 60 | if err != nil { 61 | fmt.Printf("Authentication error: %s\n", err) 62 | os.Exit(4) 63 | } 64 | region, ok := aws.Regions[*flagAWSRegion] 65 | if !ok { 66 | fmt.Printf("Parameter 'aws-region' must be a valid AWS Region\n") 67 | os.Exit(5) 68 | } 69 | s := s3.New(auth, region) 70 | b = s.Bucket(*flagBucket) 71 | 72 | var errCount int 73 | var totalCount int 74 | var totalSize int64 75 | 76 | startTime := time.Now().UTC() 77 | 78 | // List the keys as we see them 79 | for k := range s3splitfile.S3Iterator(b, prefix, schema) { 80 | if k.Err != nil { 81 | fmt.Printf("ERROR fetching key: %s\n", k.Err) 82 | errCount++ 83 | } else { 84 | totalCount++ 85 | totalSize += k.Key.Size 86 | fmt.Printf("%s\n", k.Key.Key) 87 | } 88 | } 89 | 90 | duration := time.Now().UTC().Sub(startTime).Seconds() 91 | 92 | if *flagVerbose { 93 | fmt.Printf("Filter matched %d files totaling %s in %.02fs (%d errors)\n", 94 | totalCount, s3splitfile.PrettySize(totalSize), duration, errCount) 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /heka/cmd/s3cat/main.go: -------------------------------------------------------------------------------- 1 | /***** BEGIN LICENSE BLOCK ***** 2 | # This Source Code Form is subject to the terms of the Mozilla Public 3 | # License, v. 2.0. If a copy of the MPL was not distributed with this file, 4 | # You can obtain one at http://mozilla.org/MPL/2.0/. 5 | # ***** END LICENSE BLOCK *****/ 6 | 7 | /* 8 | 9 | A command-line utility for fetching a set of files on Amazon S3 as a single data 10 | stream. 
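Illustrative usage (a sketch, not from the original docs; the flags are defined
in main() below, and the bucket and key names are hypothetical). Keys can be
passed as arguments, or piped in one per line with -stdin, so it composes with
heka-s3list:

    s3cat -bucket my-bucket key/one key/two > combined.out
    heka-s3list -schema filter.json -bucket my-bucket | s3cat -bucket my-bucket -stdin > combined.out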
11 | 12 | */ 13 | package main 14 | 15 | import ( 16 | "bufio" 17 | "flag" 18 | "fmt" 19 | "github.com/AdRoll/goamz/aws" 20 | "github.com/AdRoll/goamz/s3" 21 | "io" 22 | "math" 23 | "os" 24 | "time" 25 | ) 26 | 27 | var bytesRead uint64 28 | 29 | func main() { 30 | flagStdin := flag.Bool("stdin", false, "read list of s3 key names from stdin") 31 | flagBucket := flag.String("bucket", "default-bucket", "S3 Bucket name") 32 | flagAWSKey := flag.String("aws-key", "", "AWS Key") 33 | flagAWSSecretKey := flag.String("aws-secret-key", "", "AWS Secret Key") 34 | flagAWSRegion := flag.String("aws-region", "us-west-2", "AWS Region") 35 | flagConnectTimeout := flag.Uint64("connect_timeout", 60, "Max seconds to wait for an S3 connection") 36 | flagReadTimeout := flag.Uint64("read_timeout", 300, "Max seconds to wait for an S3 file read to complete") 37 | flag.Parse() 38 | 39 | if !*flagStdin && flag.NArg() < 1 { 40 | flag.PrintDefaults() 41 | os.Exit(1) 42 | } 43 | 44 | var connectTimeout uint32 45 | if *flagConnectTimeout < math.MaxUint32 { 46 | connectTimeout = uint32(*flagConnectTimeout) 47 | } else { 48 | fmt.Fprintf(os.Stderr, "Connection Timeout is too large:%d.\n", flagConnectTimeout) 49 | os.Exit(8) 50 | } 51 | 52 | var readTimeout uint32 53 | if *flagReadTimeout < math.MaxUint32 { 54 | readTimeout = uint32(*flagReadTimeout) 55 | } else { 56 | fmt.Fprintf(os.Stderr, "Read Timeout is too large:%d.\n", flagReadTimeout) 57 | os.Exit(8) 58 | } 59 | 60 | auth, err := aws.GetAuth(*flagAWSKey, *flagAWSSecretKey, "", time.Now()) 61 | if err != nil { 62 | fmt.Fprintf(os.Stderr, "Authentication error: %s\n", err) 63 | os.Exit(4) 64 | } 65 | region, ok := aws.Regions[*flagAWSRegion] 66 | if !ok { 67 | fmt.Fprintf(os.Stderr, "Parameter 'aws-region' must be a valid AWS Region\n") 68 | os.Exit(5) 69 | } 70 | s := s3.New(auth, region) 71 | if connectTimeout > 0 { 72 | s.ConnectTimeout = time.Duration(connectTimeout) * time.Second 73 | } 74 | if readTimeout > 0 { 75 | s.ReadTimeout = time.Duration(readTimeout) * time.Second 76 | } 77 | bucket := s.Bucket(*flagBucket) 78 | 79 | startTime := time.Now().UTC() 80 | totalFiles := 0 81 | if *flagStdin { 82 | scanner := bufio.NewScanner(os.Stdin) 83 | for scanner.Scan() { 84 | filename := scanner.Text() 85 | totalFiles++ 86 | cat(bucket, filename) 87 | } 88 | } else { 89 | for _, filename := range flag.Args() { 90 | totalFiles++ 91 | cat(bucket, filename) 92 | } 93 | } 94 | 95 | duration := time.Now().UTC().Sub(startTime).Seconds() 96 | mb := float64(bytesRead) / 1024.0 / 1024.0 97 | if duration == 0.0 { 98 | duration = 1.0 99 | } 100 | fmt.Fprintf(os.Stderr, "All done processing %d files, %.2fMB in %.2f seconds (%.2fMB/s)\n", totalFiles, mb, duration, (mb / duration)) 101 | } 102 | 103 | // Cat the data from a single S3 key 104 | func cat(bucket *s3.Bucket, s3Key string) { 105 | var lastGoodOffset uint64 106 | 107 | RetryS3: 108 | for attempt := 1; attempt <= 5; attempt++ { 109 | rc, err := getS3Reader(bucket, s3Key, lastGoodOffset) 110 | if err != nil && err != io.EOF { 111 | fmt.Fprintf(os.Stderr, "Error in attempt %d reading %s at offset %d: %s\n", attempt, s3Key, lastGoodOffset, err) 112 | continue RetryS3 113 | } else { 114 | nr := bufio.NewReader(rc) 115 | n, err := nr.WriteTo(os.Stdout) 116 | if err != nil && err != io.EOF { 117 | fmt.Fprintf(os.Stderr, "Error in attempt %d writing %s at offset %d: %s\n", attempt, s3Key, lastGoodOffset, err) 118 | rc.Close() 119 | if err.Error() == "write /dev/stdout: broken pipe" { 120 | os.Exit(1) 121 | } 122 | continue 
RetryS3 123 | } 124 | lastGoodOffset += uint64(n) 125 | bytesRead += uint64(n) 126 | } 127 | rc.Close() 128 | break 129 | } 130 | } 131 | 132 | // Callers must call Close() on rc. 133 | func getS3Reader(bucket *s3.Bucket, s3Key string, offset uint64) (rc io.ReadCloser, err error) { 134 | if offset == 0 { 135 | rc, err = bucket.GetReader(s3Key) 136 | return 137 | } 138 | 139 | headers := map[string][]string{ 140 | "Range": []string{fmt.Sprintf("bytes=%d-", offset)}, 141 | } 142 | 143 | resp, err := bucket.GetResponseWithHeaders(s3Key, headers) 144 | 145 | if resp != nil { 146 | rc = resp.Body 147 | } 148 | return 149 | } 150 | -------------------------------------------------------------------------------- /heka/patches/0002-Add-cmdline-tool-for-uploading-to-S3.patch: -------------------------------------------------------------------------------- 1 | From 70654e1d8f917f5e97a8305dd5a632ebe086d252 Mon Sep 17 00:00:00 2001 2 | From: Mark Reid 3 | Date: Mon, 12 Jan 2015 09:25:48 -0400 4 | Subject: [PATCH] Add cmdline tool for uploading to S3. 5 | 6 | --- 7 | CMakeLists.txt | 8 +++ 8 | 1 file changed, 8 insertions(+) 9 | 10 | diff --git a/CMakeLists.txt b/CMakeLists.txt 11 | index 868bf50..602deac 100644 12 | --- a/CMakeLists.txt 13 | +++ b/CMakeLists.txt 14 | @@ -36,6 +36,7 @@ set(SBMGRLOAD_EXE "${PROJECT_PATH}/bin/heka-sbmgrload${CMAKE_EXECUTABLE_SUFFIX}" 15 | set(INJECT_EXE "${PROJECT_PATH}/bin/heka-inject${CMAKE_EXECUTABLE_SUFFIX}") 16 | set(LOGSTREAMER_EXE "${PROJECT_PATH}/bin/heka-logstreamer${CMAKE_EXECUTABLE_SUFFIX}") 17 | set(HEKA_CAT_EXE "${PROJECT_PATH}/bin/heka-cat${CMAKE_EXECUTABLE_SUFFIX}") 18 | +set(HEKA_EXPORT_EXE "${PROJECT_PATH}/bin/heka-export${CMAKE_EXECUTABLE_SUFFIX}") 19 | 20 | option(INCLUDE_SANDBOX "Include Lua sandbox" on) 21 | option(INCLUDE_MOZSVC "Include the Mozilla services plugins" on) 22 | @@ -217,6 +218,13 @@ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}) 23 | 24 | install(PROGRAMS "${HEKA_CAT_EXE}" DESTINATION bin) 25 | 26 | +add_custom_target(heka-export ALL 27 | +${GO_EXECUTABLE} install ${LDFLAGS} github.com/mozilla-services/heka/cmd/heka-export 28 | +DEPENDS hekad 29 | +WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}) 30 | + 31 | +install(PROGRAMS "${HEKA_EXPORT_EXE}" DESTINATION bin) 32 | + 33 | add_custom_target(sbmgr ALL 34 | ${GO_EXECUTABLE} install ${LDFLAGS} github.com/mozilla-services/heka/cmd/heka-sbmgr 35 | DEPENDS hekad) 36 | -------------------------------------------------------------------------------- /heka/patches/0003-Add-more-cmds.patch: -------------------------------------------------------------------------------- 1 | From b31a2ce9ab6d3f5cf744c8234fd145ae5c14a786 Mon Sep 17 00:00:00 2001 2 | From: Mark Reid 3 | Date: Wed, 4 Feb 2015 17:10:10 -0400 4 | Subject: [PATCH] Update build to include heka-s3list and heka-s3cat 5 | 6 | --- 7 | CMakeLists.txt | 16 ++++++++++++++++ 8 | 1 file changed, 16 insertions(+) 9 | 10 | diff --git a/CMakeLists.txt b/CMakeLists.txt 11 | index a5cdd21..705d223 100644 12 | --- a/CMakeLists.txt 13 | +++ b/CMakeLists.txt 14 | @@ -37,6 +37,9 @@ set(INJECT_EXE "${PROJECT_PATH}/bin/heka-inject${CMAKE_EXECUTABLE_SUFFIX}") 15 | set(LOGSTREAMER_EXE "${PROJECT_PATH}/bin/heka-logstreamer${CMAKE_EXECUTABLE_SUFFIX}") 16 | set(HEKA_CAT_EXE "${PROJECT_PATH}/bin/heka-cat${CMAKE_EXECUTABLE_SUFFIX}") 17 | set(HEKA_EXPORT_EXE "${PROJECT_PATH}/bin/heka-export${CMAKE_EXECUTABLE_SUFFIX}") 18 | +set(HEKA_S3LIST_EXE "${PROJECT_PATH}/bin/heka-s3list${CMAKE_EXECUTABLE_SUFFIX}") 19 | +set(HEKA_S3CAT_EXE 
"${PROJECT_PATH}/bin/heka-s3cat${CMAKE_EXECUTABLE_SUFFIX}") 20 | +set(S3CAT_EXE "${PROJECT_PATH}/bin/s3cat${CMAKE_EXECUTABLE_SUFFIX}") 21 | 22 | option(INCLUDE_SANDBOX "Include Lua sandbox" on) 23 | option(INCLUDE_MOZSVC "Include the Mozilla services plugins" on) 24 | @@ -225,6 +227,27 @@ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}) 25 | 26 | install(PROGRAMS "${HEKA_EXPORT_EXE}" DESTINATION bin) 27 | 28 | +add_custom_target(heka-s3list ALL 29 | +${GO_EXECUTABLE} install ${LDFLAGS} github.com/mozilla-services/heka/cmd/heka-s3list 30 | +DEPENDS hekad 31 | +WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}) 32 | + 33 | +install(PROGRAMS "${HEKA_S3LIST_EXE}" DESTINATION bin) 34 | + 35 | +add_custom_target(heka-s3cat ALL 36 | +${GO_EXECUTABLE} install ${LDFLAGS} github.com/mozilla-services/heka/cmd/heka-s3cat 37 | +DEPENDS hekad 38 | +WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}) 39 | + 40 | +install(PROGRAMS "${HEKA_S3CAT_EXE}" DESTINATION bin) 41 | + 42 | +add_custom_target(s3cat ALL 43 | +${GO_EXECUTABLE} install ${LDFLAGS} github.com/mozilla-services/heka/cmd/s3cat 44 | +DEPENDS hekad 45 | +WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}) 46 | + 47 | +install(PROGRAMS "${S3CAT_EXE}" DESTINATION bin) 48 | + 49 | add_custom_target(sbmgr ALL 50 | ${GO_EXECUTABLE} install ${LDFLAGS} github.com/mozilla-services/heka/cmd/heka-sbmgr 51 | DEPENDS hekad) 52 | -- 53 | 1.9.4 (Apple Git-50.2) 54 | 55 | -------------------------------------------------------------------------------- /heka/plugins/fx/common.c: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* vim: set ts=2 et sw=2 tw=80: */ 3 | /* This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 | 7 | /** @brief Lua cuckoo filter common functions @file */ 8 | 9 | #include "common.h" 10 | 11 | unsigned clp2(unsigned x) 12 | { 13 | x = x - 1; 14 | x = x | (x >> 1); 15 | x = x | (x >> 2); 16 | x = x | (x >> 4); 17 | x = x | (x >> 8); 18 | x = x | (x >> 16); 19 | return x + 1; 20 | } 21 | 22 | 23 | int nlz(unsigned x) 24 | { 25 | int n; 26 | 27 | if (x == 0) return 32; 28 | n = 1; 29 | if ((x >> 16) == 0) {n = n + 16; x = x << 16;} 30 | if ((x >> 24) == 0) {n = n + 8; x = x << 8;} 31 | if ((x >> 28) == 0) {n = n + 4; x = x << 4;} 32 | if ((x >> 30) == 0) {n = n + 2; x = x << 2;} 33 | n = n - (x >> 31); 34 | return n; 35 | } 36 | 37 | 38 | unsigned fingerprint(unsigned h) 39 | { 40 | h = h >> 16; 41 | return h ? h : 1; 42 | } 43 | -------------------------------------------------------------------------------- /heka/plugins/fx/common.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* vim: set ts=2 et sw=2 tw=80: */ 3 | /* This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 | 7 | /** @brief Lua cuckoo filter common functions @file */ 8 | 9 | #ifndef common_h_ 10 | #define common_h_ 11 | 12 | #define BUCKET_SIZE 4 13 | 14 | /** 15 | * Hacker's Delight - Henry S. Warren, Jr. 
page 48 16 | * 17 | * @param x 18 | * 19 | * @return unsigned Least power of 2 greater than or equal to x 20 | */ 21 | unsigned clp2(unsigned x); 22 | 23 | /** 24 | * Hacker's Delight - Henry S. Warren, Jr. page 78 25 | * 26 | * @param x 27 | * 28 | * @return int Number of leading zeros 29 | */ 30 | int nlz(unsigned x); 31 | 32 | /** 33 | * Turn the unsigned value into a 16 bit fingerprint 34 | * 35 | * @param h 36 | * 37 | * @return unsigned 38 | */ 39 | unsigned fingerprint(unsigned h); 40 | #endif 41 | -------------------------------------------------------------------------------- /heka/plugins/hash/lua_hash.c: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* vim: set ts=2 et sw=2 tw=80: */ 3 | /* This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 | 7 | /** @brief Lua hash functions @file */ 8 | 9 | #include "lauxlib.h" 10 | #include "lua.h" 11 | #include <zlib.h> 12 | 13 | static int zlib_adler32(lua_State* lua) 14 | { 15 | size_t len; 16 | const char* buf; 17 | 18 | if (lua_type(lua, 1) == LUA_TSTRING) { 19 | buf = lua_tolstring(lua, 1, &len); 20 | } else { 21 | return luaL_argerror(lua, 1, "must be a string"); 22 | } 23 | 24 | uLong adler = adler32(0L, Z_NULL, 0); 25 | adler = adler32(adler, buf, len); 26 | lua_pushinteger(lua, adler); 27 | 28 | return 1; 29 | } 30 | 31 | 32 | static int zlib_crc32(lua_State* lua) 33 | { 34 | size_t len; 35 | const char* buf; 36 | 37 | if (lua_type(lua, 1) == LUA_TSTRING) { 38 | buf = lua_tolstring(lua, 1, &len); 39 | } else { 40 | return luaL_argerror(lua, 1, "must be a string"); 41 | } 42 | 43 | uLong crc = crc32(0L, Z_NULL, 0); 44 | crc = crc32(crc, buf, len); 45 | lua_pushinteger(lua, crc); 46 | 47 | return 1; 48 | } 49 | 50 | 51 | static const struct luaL_reg hashlib_f[] = 52 | { 53 | { "adler32", zlib_adler32 } 54 | , { "crc32", zlib_crc32 } 55 | , { NULL, NULL } 56 | }; 57 | 58 | 59 | int luaopen_hash(lua_State* lua) 60 | { 61 | luaL_register(lua, "hash", hashlib_f); 62 | return 1; 63 | } 64 | -------------------------------------------------------------------------------- /heka/plugins/kafkaconsumergroup/kafka_consumer_group_input_test.go: -------------------------------------------------------------------------------- 1 | /***** BEGIN LICENSE BLOCK ***** 2 | # This Source Code Form is subject to the terms of the Mozilla Public 3 | # License, v. 2.0. If a copy of the MPL was not distributed with this file, 4 | # You can obtain one at http://mozilla.org/MPL/2.0/. 5 | # 6 | # The Initial Developer of the Original Code is the Mozilla Foundation. 7 | # Portions created by the Initial Developer are Copyright (C) 2014-2015 8 | # the Initial Developer. All Rights Reserved. 9 | # 10 | # Contributor(s): 11 | # Mike Trinkala (trink@mozilla.com) 12 | # Rob Miller (rmiller@mozilla.com) 13 | # Wesley Dawson (whd@mozilla.com) 14 | # 15 | # ***** END LICENSE BLOCK *****/ 16 | 17 | package kafkaconsumergroup 18 | 19 | import ( 20 | "testing" 21 | 22 | .
"github.com/mozilla-services/heka/pipeline" 23 | ) 24 | 25 | func TestEmptyZookeeperConnectionString(t *testing.T) { 26 | pConfig := NewPipelineConfig(nil) 27 | ki := new(KafkaConsumerGroupInput) 28 | ki.SetPipelineConfig(pConfig) 29 | config := ki.ConfigStruct().(*KafkaConsumerGroupInputConfig) 30 | config.ConsumerGroup = "test" 31 | config.Topics = []string{"test"} 32 | err := ki.Init(config) 33 | 34 | errmsg := "zookeeper_connection_string required" 35 | if err.Error() != errmsg { 36 | t.Errorf("Expected: %s, received: %s", errmsg, err) 37 | } 38 | } 39 | 40 | func TestBadZookeeperConnectionString(t *testing.T) { 41 | pConfig := NewPipelineConfig(nil) 42 | ki := new(KafkaConsumerGroupInput) 43 | ki.SetPipelineConfig(pConfig) 44 | config := ki.ConfigStruct().(*KafkaConsumerGroupInputConfig) 45 | config.ConsumerGroup = "test" 46 | config.Topics = []string{"test"} 47 | config.ZookeeperConnectionString = "::" 48 | err := ki.Init(config) 49 | 50 | errmsg := "too many colons in address ::" 51 | if err.Error() != errmsg { 52 | t.Errorf("Expected: %s, received: %s", errmsg, err) 53 | } 54 | } 55 | 56 | func TestInvalidOffsetMethod(t *testing.T) { 57 | pConfig := NewPipelineConfig(nil) 58 | ki := new(KafkaConsumerGroupInput) 59 | ki.SetName("test") 60 | ki.SetPipelineConfig(pConfig) 61 | 62 | config := ki.ConfigStruct().(*KafkaConsumerGroupInputConfig) 63 | config.ConsumerGroup = "test" 64 | config.Topics = []string{"test"} 65 | config.ZookeeperConnectionString = "localhost:2181" 66 | config.OffsetMethod = "last" 67 | err := ki.Init(config) 68 | 69 | errmsg := "invalid offset_method: last" 70 | if err.Error() != errmsg { 71 | t.Errorf("Expected: %s, received: %s", errmsg, err) 72 | } 73 | } 74 | 75 | func TestEmptyInputTopics(t *testing.T) { 76 | pConfig := NewPipelineConfig(nil) 77 | ki := new(KafkaConsumerGroupInput) 78 | ki.SetPipelineConfig(pConfig) 79 | config := ki.ConfigStruct().(*KafkaConsumerGroupInputConfig) 80 | config.ConsumerGroup = "test" 81 | config.ZookeeperConnectionString = "localhost:2181" 82 | err := ki.Init(config) 83 | 84 | errmsg := "topics required" 85 | if err.Error() != errmsg { 86 | t.Errorf("Expected: %s, received: %s", errmsg, err) 87 | } 88 | } 89 | 90 | func TestMissingConsumerGroup(t *testing.T) { 91 | pConfig := NewPipelineConfig(nil) 92 | ki := new(KafkaConsumerGroupInput) 93 | ki.SetPipelineConfig(pConfig) 94 | config := ki.ConfigStruct().(*KafkaConsumerGroupInputConfig) 95 | config.Topics = []string{"test"} 96 | config.ZookeeperConnectionString = "localhost:2181" 97 | err := ki.Init(config) 98 | 99 | errmsg := "consumer_group required" 100 | if err.Error() != errmsg { 101 | t.Errorf("Expected: %s, received: %s", errmsg, err) 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /heka/plugins/s3splitfile/all_specs_test.go: -------------------------------------------------------------------------------- 1 | /***** BEGIN LICENSE BLOCK ***** 2 | # This Source Code Form is subject to the terms of the Mozilla Public 3 | # License, v. 2.0. If a copy of the MPL was not distributed with this file, 4 | # You can obtain one at http://mozilla.org/MPL/2.0/. 
5 | # ***** END LICENSE BLOCK *****/ 6 | 7 | package s3splitfile 8 | 9 | import ( 10 | "github.com/rafrombrc/gospec/src/gospec" 11 | "testing" 12 | ) 13 | 14 | func TestAllSpecs(t *testing.T) { 15 | r := gospec.NewRunner() 16 | r.Parallel = false 17 | 18 | r.AddSpec(S3SplitFileSpec) 19 | 20 | gospec.MainGoTest(r, t) 21 | } 22 | -------------------------------------------------------------------------------- /heka/plugins/s3splitfile/s3splitfile_common_test.go: -------------------------------------------------------------------------------- 1 | /***** BEGIN LICENSE BLOCK ***** 2 | # This Source Code Form is subject to the terms of the Mozilla Public 3 | # License, v. 2.0. If a copy of the MPL was not distributed with this file, 4 | # You can obtain one at http://mozilla.org/MPL/2.0/. 5 | # ***** END LICENSE BLOCK *****/ 6 | 7 | package s3splitfile 8 | 9 | import ( 10 | "github.com/mozilla-services/heka/message" 11 | . "github.com/mozilla-services/heka/pipeline" 12 | gs "github.com/rafrombrc/gospec/src/gospec" 13 | "path/filepath" 14 | ) 15 | 16 | func testFieldVal(c gs.Context, schema Schema, field string, actual string, expected string) { 17 | sVal, err := schema.GetValue(field, actual) 18 | c.Expect(err, gs.IsNil) 19 | c.Expect(sVal, gs.Equals, expected) 20 | } 21 | 22 | func S3SplitFileSpec(c gs.Context) { 23 | c.Specify("Sanitize dimensions", func() { 24 | c.Expect("hello_there", gs.Equals, SanitizeDimension("hello!there")) 25 | 26 | c.Expect("___________________________", gs.Equals, SanitizeDimension("!@#$%^&*(){}[]|+=-`~'\",<>?\x02")) 27 | }) 28 | 29 | c.Specify("JSON Schema", func() { 30 | schema, err := LoadSchema(filepath.Join(".", "testsupport", "schema.json")) 31 | c.Expect(err, gs.IsNil) 32 | 33 | c.Expect(len(schema.Fields), gs.Equals, 5) 34 | 35 | // Bogus field: 36 | _, err = schema.GetValue("bogus", "some value") 37 | c.Expect(err, gs.Not(gs.IsNil)) 38 | 39 | testFieldVal(c, schema, "any", "foo", "foo") 40 | testFieldVal(c, schema, "any", "Any value at all is acceptable!", "Any value at all is acceptable!") 41 | 42 | testFieldVal(c, schema, "list", "foo", "foo") 43 | testFieldVal(c, schema, "list", "bar", "bar") 44 | testFieldVal(c, schema, "list", "baz", "baz") 45 | testFieldVal(c, schema, "list", "quux", "OTHER") 46 | testFieldVal(c, schema, "list", "Some values are not acceptable!", "OTHER") 47 | 48 | testFieldVal(c, schema, "rangeMin", "aaa", "aaa") 49 | testFieldVal(c, schema, "rangeMin", "foo", "foo") 50 | testFieldVal(c, schema, "rangeMin", "bar", "bar") 51 | testFieldVal(c, schema, "rangeMin", "all values larger than 'aaa' are fine!", "all values larger than 'aaa' are fine!") 52 | testFieldVal(c, schema, "rangeMin", "100", "OTHER") 53 | 54 | testFieldVal(c, schema, "rangeMax", "all", "all") 55 | testFieldVal(c, schema, "rangeMax", "bar", "bar") 56 | testFieldVal(c, schema, "rangeMax", "bbb", "bbb") 57 | testFieldVal(c, schema, "rangeMax", "all values smaller than 'bbb' are fine!", "all values smaller than 'bbb' are fine!") 58 | testFieldVal(c, schema, "rangeMax", "100", "100") 59 | testFieldVal(c, schema, "rangeMax", "ccc", "OTHER") 60 | 61 | testFieldVal(c, schema, "range", "aaa", "aaa") 62 | testFieldVal(c, schema, "range", "all", "all") 63 | testFieldVal(c, schema, "range", "bar", "bar") 64 | testFieldVal(c, schema, "range", "bbb", "bbb") 65 | testFieldVal(c, schema, "range", "all values between 'aaa' and 'bbb' are fine!", "all values between 'aaa' and 'bbb' are fine!") 66 | testFieldVal(c, schema, "range", "100", "OTHER") 67 | testFieldVal(c, schema, "range", 
"aa0", "OTHER") 68 | testFieldVal(c, schema, "range", "bbc", "OTHER") 69 | testFieldVal(c, schema, "range", "ccc", "OTHER") 70 | }) 71 | 72 | c.Specify("Non-string fields", func() { 73 | schema, _ := LoadSchema(filepath.Join(".", "testsupport", "schema.json")) 74 | pack := NewPipelinePack(nil) 75 | 76 | // No fields 77 | dims := schema.GetDimensions(pack) 78 | c.Expect(dims[0], gs.Equals, "UNKNOWN") 79 | 80 | // Integer field 81 | f, _ := message.NewField("any", 1, "") 82 | pack.Message.AddField(f) 83 | dims = schema.GetDimensions(pack) 84 | c.Expect(dims[0], gs.Equals, "1") 85 | pack.Message.DeleteField(f) 86 | 87 | // Boolean field 88 | f, _ = message.NewField("any", true, "") 89 | pack.Message.AddField(f) 90 | dims = schema.GetDimensions(pack) 91 | c.Expect(dims[0], gs.Equals, "true") 92 | pack.Message.DeleteField(f) 93 | 94 | // Double field 95 | f, _ = message.NewField("any", 1.23, "") 96 | pack.Message.AddField(f) 97 | dims = schema.GetDimensions(pack) 98 | c.Expect(dims[0], gs.Equals, "1.23") 99 | pack.Message.DeleteField(f) 100 | 101 | // Empty string field 102 | f, _ = message.NewField("any", "", "") 103 | pack.Message.AddField(f) 104 | dims = schema.GetDimensions(pack) 105 | c.Expect(dims[0], gs.Equals, "UNKNOWN") 106 | pack.Message.DeleteField(f) 107 | 108 | }) 109 | } 110 | -------------------------------------------------------------------------------- /heka/plugins/s3splitfile/testsupport/schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dimensions": [ 4 | { "field_name": "any", "allowed_values": "*" }, 5 | { "field_name": "list", "allowed_values": ["foo", "bar", "baz"] }, 6 | { "field_name": "rangeMin", "allowed_values": { "min": "aaa" } }, 7 | { "field_name": "rangeMax", "allowed_values": { "max": "bbb" } }, 8 | { "field_name": "range", "allowed_values": { "min": "aaa", "max": "bbb" } } 9 | ] 10 | } 11 | 12 | -------------------------------------------------------------------------------- /heka/plugins/snap/snappy_decoder.go: -------------------------------------------------------------------------------- 1 | /***** BEGIN LICENSE BLOCK ***** 2 | # This Source Code Form is subject to the terms of the Mozilla Public 3 | # License, v. 2.0. If a copy of the MPL was not distributed with this file, 4 | # You can obtain one at http://mozilla.org/MPL/2.0/. 5 | # ***** END LICENSE BLOCK *****/ 6 | 7 | package snap 8 | 9 | import ( 10 | "github.com/golang/snappy" 11 | . "github.com/mozilla-services/heka/pipeline" 12 | ) 13 | 14 | // SnappyDecoder decompresses snappy-compressed Message bytes. 15 | type SnappyDecoder struct { 16 | } 17 | 18 | func (re *SnappyDecoder) Init(config interface{}) (err error) { 19 | return 20 | } 21 | 22 | func (re *SnappyDecoder) Decode(pack *PipelinePack) (packs []*PipelinePack, err error) { 23 | output, decodeErr := snappy.Decode(nil, pack.MsgBytes) 24 | 25 | packs = []*PipelinePack{pack} 26 | if decodeErr == nil { 27 | // Replace bytes with decoded data 28 | pack.MsgBytes = output 29 | } 30 | // If there is an error decoding snappy, maybe it wasn't compressed. We'll 31 | // return the original data and try to proceed. 
32 | return 33 | } 34 | 35 | func init() { 36 | RegisterPlugin("SnappyDecoder", func() interface{} { 37 | return new(SnappyDecoder) 38 | }) 39 | } 40 | -------------------------------------------------------------------------------- /heka/plugins/snap/snappy_encoder.go: -------------------------------------------------------------------------------- 1 | /***** BEGIN LICENSE BLOCK ***** 2 | # This Source Code Form is subject to the terms of the Mozilla Public 3 | # License, v. 2.0. If a copy of the MPL was not distributed with this file, 4 | # You can obtain one at http://mozilla.org/MPL/2.0/. 5 | # ***** END LICENSE BLOCK *****/ 6 | 7 | package snap 8 | 9 | import ( 10 | "github.com/golang/snappy" 11 | . "github.com/mozilla-services/heka/pipeline" 12 | ) 13 | 14 | // SnappyEncoder compresses the Message bytes using snappy compression. Each 15 | // message is compressed separately. 16 | type SnappyEncoder struct { 17 | } 18 | 19 | func (re *SnappyEncoder) Init(config interface{}) (err error) { 20 | return 21 | } 22 | 23 | func (re *SnappyEncoder) Encode(pack *PipelinePack) (output []byte, err error) { 24 | output = snappy.Encode(nil, pack.MsgBytes) 25 | return output, nil 26 | } 27 | 28 | func init() { 29 | RegisterPlugin("SnappyEncoder", func() interface{} { 30 | return new(SnappyEncoder) 31 | }) 32 | } 33 | -------------------------------------------------------------------------------- /heka/sandbox/decoders/decompress_payload.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | require "string" 6 | local gzip = require "gzip" 7 | 8 | function process_message() 9 | local payload = read_message("Payload") 10 | local b1, b2 = string.byte(payload, 1, 2) 11 | 12 | if b1 == 0x1f and b2 == 0x8b then -- test for gzip magic header bytes 13 | local ok, result = pcall(gzip.decompress, payload) 14 | if not ok then 15 | return -1, result 16 | end 17 | write_message("Payload", result) 18 | end 19 | 20 | return 0 21 | end 22 | -------------------------------------------------------------------------------- /heka/sandbox/decoders/extract_fhr_dimensions.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | The decoder extracts the FHR partition dimensions from the large JSON payload 7 | and adds them as message fields to avoid additional down stream parsing; it also 8 | uses an IP address lookup to determine the submission's country of origin and 9 | adds it a as a message field. 10 | 11 | Config: 12 | 13 | - geoip_city_db (string) 14 | The fully qualified path to the GeoIP city database (if not in the default 15 | location). 16 | 17 | *Example Heka Configuration* 18 | 19 | .. 
code-block:: ini 20 | 21 | [FHRDecoder] 22 | type = "SandboxDecoder" 23 | filename = "extract_fhr_dimensions.lua" 24 | memory_limit = 30000000 25 | output_limit = 2097152 26 | 27 | # Default 28 | # [FHRDecoder.config] 29 | # geoip_city_db = "/usr/local/share/GeoIP/GeoIPCity.dat" 30 | 31 | *Example Heka Message* 32 | 33 | :Timestamp: 2014-07-19 17:23:35.060999936 +0000 UTC 34 | :Type: fhr_metadata 35 | :Hostname: ip-10-227-137-43 36 | :Pid: 0 37 | :Uuid: 2dfcbeb8-18d4-41b8-af50-aa055fd94831 38 | :Logger: fhr 39 | :Payload: {...} 40 | :EnvVersion: 41 | :Severity: 7 42 | :Fields: 43 | | name:"submissionDate" type:string value:"20140719" 44 | | name:"appVersion" type:string value:"30.0" 45 | | name:"appUpdateChannel" type:string value:"release" 46 | | name:"sourceVersion" type:string value:"2" 47 | | name:"clientID" type:string value:"a6d35999-2d8d-4c68-9c6b-fbe8c514e40e" 48 | | name:"os" type:string value:"Linux" 49 | | name:"geoCountry" type:string value:"GB" 50 | | name:"sourceName" type:string value:"fhr" 51 | | name:"vendor" type:string value:"Mozilla" 52 | | name:"appBuildID" type:string value:"20140608211622" 53 | | name:"appName" type:string value:"Firefox" 54 | --]] 55 | 56 | require "cjson" 57 | require 'geoip.city' 58 | require "os" 59 | 60 | local city_db = assert(geoip.city.open(read_config("geoip_city_db"))) 61 | 62 | local msg = { 63 | Timestamp = nil, 64 | Type = "fhr_metadata", 65 | Payload = nil, 66 | Fields = { sourceName = "fhr" } 67 | } 68 | 69 | local UNK_DIM = "UNKNOWN" 70 | local UNK_GEO = "??" 71 | 72 | function process_message() 73 | -- Carry forward payload 74 | msg.Payload = read_message("Payload") 75 | 76 | local ok, fhr = pcall(cjson.decode, msg.Payload) 77 | if not ok then return -1, fhr end 78 | 79 | msg.Fields.sourceVersion = tostring(fhr.version) or UNK_DIM 80 | 81 | local info 82 | if msg.Fields.sourceVersion == "1" then 83 | if type(fhr.data) ~= "table" then 84 | return -1, "missing object: data" 85 | end 86 | if type(fhr.data.last) ~= "table" then 87 | return -1, "missing object: data.last" 88 | end 89 | if type(fhr.data.last["org.mozilla.appInfo.appinfo"]) == "table" then 90 | info = fhr.data.last["org.mozilla.appInfo.appinfo"] 91 | elseif type(fhr.data.last["org.mozilla.appInfo.appinfo.1"]) == "table" then 92 | info = fhr.data.last["org.mozilla.appInfo.appinfo.1"] 93 | else 94 | return -1, "missing object: data.last[org.mozilla.appInfo.appinfo]" 95 | end 96 | elseif msg.Fields.sourceVersion == "2" then 97 | if type(fhr.geckoAppInfo) ~= "table" then 98 | return -1, "missing object: geckoAppInfo" 99 | end 100 | info = fhr.geckoAppInfo 101 | elseif msg.Fields.sourceVersion == "3" then 102 | -- Use v3 structure. 103 | if type(fhr.environments) ~= "table" then 104 | return -1, "missing object: environments" 105 | end 106 | if type(fhr.environments.current) ~= "table" then 107 | return -1, "missing object: environments.current" 108 | end 109 | if type(fhr.environments.current.geckoAppInfo) ~= "table" then 110 | return -1, "missing object: environments.current.geckoAppInfo" 111 | end 112 | info = fhr.environments.current.geckoAppInfo 113 | else 114 | return -1, "unknown payload version" 115 | end 116 | 117 | -- Get some more dimensions 118 | msg.Fields.appName = info.name or UNK_DIM 119 | msg.Fields.appVersion = info.version or UNK_DIM 120 | msg.Fields.appUpdateChannel = info.updateChannel or UNK_DIM 121 | 122 | -- Do not want default values for these. 
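-- (Left nil on purpose: if the payload does not supply one of these values the
-- assignment is a no-op, so the field is simply omitted from the injected
-- message rather than being filled with the UNK_DIM placeholder used above.)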
123 | msg.Fields.appBuildID = info.appBuildID 124 | msg.Fields.os = info.os 125 | msg.Fields.vendor = info.vendor 126 | msg.Fields.clientID = fhr.clientID 127 | 128 | -- IP address lookup 129 | msg.Fields.geoCountry = city_db:query_by_addr(read_message("Fields[remote_addr]"), "country_code") or UNK_GEO 130 | 131 | -- Carry forward timestamp. 132 | msg.Timestamp = read_message("Timestamp") 133 | 134 | msg.Fields.submissionDate = os.date("%Y%m%d", msg.Timestamp / 1e9) 135 | 136 | -- Send new message along 137 | inject_message(msg) 138 | 139 | return 0 140 | end 141 | -------------------------------------------------------------------------------- /heka/sandbox/decoders/extract_tls_info.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Extract clock skew, issuer info and Subject / SAN match status from tls error 7 | reports. This decoder MUST NOT return failure due to the way the Heka 8 | MultiDecoder is implemented. 9 | --]] 10 | 11 | require "string" 12 | require "cjson" 13 | require "os" 14 | 15 | local openssl = require "openssl" 16 | local name = openssl.x509.name 17 | local asn1 = openssl.asn1 18 | 19 | local certPrefix = "-----BEGIN CERTIFICATE-----\n" 20 | local certSuffix = "-----END CERTIFICATE-----\n" 21 | 22 | local msg = { 23 | Type = "tls_report", 24 | Fields = {} 25 | } 26 | 27 | -- create PEM data from base64 encoded DER 28 | local function make_pem(data) 29 | local pem = certPrefix 30 | local offset = 1 31 | while offset <= data:len() do 32 | local stop = offset + 63 33 | if stop > data:len() then 34 | stop = data:len() 35 | end 36 | pem = pem .. data:sub(offset, stop) .. "\n" 37 | offset = stop + 1 38 | end 39 | return pem .. 
certSuffix 40 | end 41 | 42 | -- read and parse a certificate 43 | local function read_cert(data) 44 | local pem = make_pem(data) 45 | return pcall(openssl.x509.read, pem) 46 | end 47 | 48 | local function parse_cert(cert) 49 | return pcall(cert.parse, cert) 50 | end 51 | 52 | local duplicate_original = read_config("duplicate_original") 53 | 54 | function process_message() 55 | if duplicate_original then 56 | inject_message(read_message("raw")) 57 | end 58 | 59 | msg.Fields["submissionDate"] = read_message("Fields[submissionDate]") 60 | 61 | local payload = read_message("Fields[submission]") 62 | local ok, report = pcall(cjson.decode, payload) 63 | if not ok then return -1, report end 64 | 65 | -- copy over the expected fields 66 | local expected = { 67 | "hostname", 68 | "port", 69 | "timestamp", 70 | "errorCode", 71 | "failedCertChain", 72 | "userAgent", 73 | "version", 74 | "build", 75 | "product", 76 | "channel" 77 | } 78 | 79 | for i, fieldname in ipairs(expected) do 80 | local field = report[fieldname] 81 | -- ensure the field is not empty (and does not contain an empty table) 82 | if not ("table" == type(field) and next(field) == nil) then 83 | msg.Fields[fieldname] = field 84 | end 85 | end 86 | 87 | -- calculate the clock skew - in seconds, since os.time() returns those 88 | local reportTime = report["timestamp"] 89 | if "number" == type(reportTime) then 90 | -- skew will be positive if the remote timestamp is in the future 91 | local skew = reportTime - os.time() 92 | 93 | msg.Fields["skew"] = skew 94 | end 95 | 96 | -- extract the rootmost and end entity certificates 97 | local failedCertChain = report["failedCertChain"] 98 | local ee = nil 99 | local rootMost = nil 100 | if "table" == type(failedCertChain) then 101 | for i, cert in ipairs(failedCertChain) do 102 | if not ee then 103 | ee = cert 104 | end 105 | rootMost = cert 106 | end 107 | end 108 | 109 | -- get the issuer name from the root-most certificate 110 | if rootMost then 111 | local parsed = nil 112 | local ok, cert = read_cert(rootMost); 113 | if ok and cert then 114 | ok, parsed = parse_cert(cert) 115 | end 116 | if ok and parsed then 117 | local issuer = parsed["issuer"] 118 | if issuer then 119 | msg.Fields["rootIssuer"] = issuer:get_text("CN") 120 | end 121 | end 122 | end 123 | 124 | -- determine if the end entity subject or SAN matches the hostname 125 | local hostname = report["hostname"] 126 | if ee and hostname then 127 | local ok, cert = read_cert(ee); 128 | if ok and cert then 129 | local ok, matches = pcall(cert.check_host, cert, hostname) 130 | if ok and matches then 131 | msg.Fields["hostnameMatch"] = matches 132 | end 133 | end 134 | end 135 | 136 | inject_message(msg) 137 | return 0 138 | end 139 | -------------------------------------------------------------------------------- /heka/sandbox/encoders/combine_telemetry_objects.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | require "cjson" 6 | local l = require "lpeg" 7 | 8 | local grammar = (l.C"payload" + l.C"environment") * l.P"." 
* l.C(l.P(1)^1) 9 | 10 | function process_message() 11 | local raw = read_message("raw") 12 | local ok, msg = pcall(decode_message, raw) 13 | if not ok then return -1, msg end 14 | 15 | if type(msg.Fields) ~= "table" then return -1, "missing Fields" end 16 | 17 | local meta = { 18 | Timestamp = msg.Timestamp / 1e9, 19 | Type = msg.Type, 20 | Hostname = msg.Hostname, 21 | } 22 | 23 | local ok, json = pcall(cjson.decode, read_message("Payload")) 24 | if not ok then return -1, json end 25 | 26 | for i=1, #msg.Fields do 27 | local section, name = grammar:match(msg.Fields[i].name) 28 | if section then 29 | local ok, object = pcall(cjson.decode, msg.Fields[i].value[1]) 30 | if ok then 31 | json[section][name] = object 32 | end 33 | else 34 | meta[msg.Fields[i].name] = msg.Fields[i].value[1] 35 | end 36 | end 37 | 38 | local ok, jmeta = pcall(cjson.encode, meta) 39 | if not ok then return -1, jmeta end 40 | local ok, payload = pcall(cjson.encode, json) 41 | if not ok then return -1, payload end 42 | 43 | inject_payload("txt", "output", json.clientId, "\t[", jmeta, ",", payload, "]\n") 44 | return 0 45 | end 46 | -------------------------------------------------------------------------------- /heka/sandbox/filters/count_by_normalized_channel.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Request Counts by Normalized Channel 7 | 8 | *Example Heka Configuration* 9 | 10 | .. code-block:: ini 11 | 12 | [CountByNormalizedChannel] 13 | type = "SandboxFilter" 14 | filename = "lua_filters/count_by_normalized_channel.lua" 15 | message_matcher = "Logger == 'fx' && Type == 'executive_summary' && Fields[vendor] == 'Mozilla' && Fields[app] == 'Firefox'" 16 | ticker_interval = 60 17 | preserve_data = true 18 | 19 | --]] 20 | 21 | require "circular_buffer" 22 | fx = require "fx" 23 | 24 | local rows = read_config("rows") or 1440 25 | local sec_per_row = read_config("sec_per_row") or 60 26 | 27 | local nchannels = fx.get_channel_count() 28 | local channel_counter = circular_buffer.new(rows, nchannels, sec_per_row, true) 29 | for i=1,nchannels do 30 | -- Circular buffer columns are one-based, channel ids are zero-based. 31 | channel_counter:set_header(i, fx.get_channel_name(i - 1)) 32 | end 33 | 34 | function process_message() 35 | local ts = read_message("Timestamp") 36 | local normalized = fx.normalize_channel(read_message("Fields[channel]")) 37 | 38 | -- Need to add one to account for "Other" (which comes back as zero) 39 | local column_id = fx.get_channel_id(normalized) + 1 40 | channel_counter:add(ts, column_id, 1) 41 | return 0 42 | end 43 | 44 | local title = "Counts by Normalized Channel" 45 | function timer_event(ns) 46 | inject_payload("cbuf", title, channel_counter:format("cbuf")) 47 | inject_payload("cbufd", title, channel_counter:format("cbufd")) 48 | end 49 | -------------------------------------------------------------------------------- /heka/sandbox/filters/fhr_requests.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | --[[ 6 | FHR Request Counts 7 | 8 | *Example Heka Configuration* 9 | 10 | .. code-block:: ini 11 | 12 | [FHRRequestCount] 13 | type = "SandboxFilter" 14 | filename = "lua_filters/fhr_requests.lua" 15 | message_matcher = "Logger == 'fx' && Type == 'executive_summary'" 16 | ticker_interval = 60 17 | preserve_data = true 18 | 19 | --]] 20 | _PRESERVATION_VERSION = 1 21 | 22 | require "circular_buffer" 23 | local alert = require "alert" 24 | local annotation = require "annotation" 25 | local anomaly = require "anomaly" 26 | 27 | local title = "FHR Requests" 28 | local rows = read_config("rows") or 14400 29 | local sec_per_row = read_config("sec_per_row") or 60 30 | local anomaly_config = anomaly.parse_config(read_config("anomaly_config")) 31 | annotation.set_prune(title, rows * sec_per_row * 1e9) 32 | 33 | cbuf = circular_buffer.new(rows, 1, sec_per_row, true) 34 | cbuf:set_header(1, "Requests") 35 | 36 | function process_message () 37 | cbuf:add(read_message("Timestamp"), 1, 1) 38 | return 0 39 | end 40 | 41 | function timer_event(ns) 42 | if anomaly_config then 43 | if not alert.throttled(ns) then 44 | local msg, annos = anomaly.detect(ns, title, cbuf, anomaly_config) 45 | if msg then 46 | annotation.concat(title, annos) 47 | alert.send(ns, msg) 48 | end 49 | end 50 | inject_payload("cbuf", title, annotation.prune(title, ns), cbuf:format("cbuf")) 51 | else 52 | inject_payload("cbuf", title, cbuf:format("cbuf")) 53 | end 54 | inject_payload("cbufd", title, cbuf:format("cbufd")) 55 | end 56 | -------------------------------------------------------------------------------- /heka/sandbox/filters/firefox_active_instances.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Firefox Active Instances 7 | 8 | *Example Heka Configuration* 9 | 10 | .. 
code-block:: ini 11 | 12 | [FirefoxActiveInstances] 13 | type = "SandboxFilter" 14 | filename = "lua_filters/firefox_active_instances.lua" 15 | message_matcher = "Logger == 'fx' && Type == 'executive_summary' && Fields[vendor] == 'Mozilla' && Fields[app] == 'Firefox'" 16 | ticker_interval = 60 17 | preserve_data = true 18 | --]] 19 | require "circular_buffer" 20 | require "cjson" 21 | require "math" 22 | require "os" 23 | require "hyperloglog" 24 | 25 | local DAYS = 30 26 | local SEC_IN_DAY = 60 * 60 * 24 27 | local floor = math.floor 28 | local date = os.date 29 | 30 | day_cb = circular_buffer.new(DAYS, 1, SEC_IN_DAY, true) 31 | day_cb:set_header(1, "Active Instances") 32 | day_hll = {} 33 | for i=1,DAYS do 34 | day_hll[i] = hyperloglog.new() 35 | end 36 | current_day = -1 37 | 38 | local month_names = {"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", 39 | "Sep", "Oct", "Nov", "Dec"} 40 | local MONTHS = #month_names 41 | month_hll = {} 42 | for i=1,MONTHS do 43 | month_hll[i] = hyperloglog.new() 44 | end 45 | current_month = -1 46 | 47 | local function clear_days(s, e) 48 | for i = s + 1, e do 49 | local idx = i % DAYS + 1 50 | day_hll[idx]:clear() 51 | day_cb:set(i * SEC_IN_DAY * 1e9, 1, 0) 52 | end 53 | end 54 | 55 | local function update_day(ts, cid, day) 56 | if current_day == -1 then current_day = day end 57 | 58 | local delta = day - current_day 59 | if delta > 0 and delta < DAYS then 60 | clear_days(current_day, day) 61 | current_day = day 62 | elseif delta >= DAYS then 63 | clear_days(current_day, current_day + DAYS) 64 | current_day = current_day + delta 65 | elseif delta <= -DAYS then 66 | return -- ignore data in the past 67 | end 68 | local idx = day % DAYS + 1 69 | if day_hll[idx]:add(cid) then 70 | day_cb:set(ts, 1, day_hll[idx]:count()) 71 | end 72 | end 73 | 74 | local function clear_months(s, n) 75 | for i = 1, n do 76 | s = s + 1 77 | if s > MONTHS then s = 1 end 78 | month_hll[s]:clear() 79 | end 80 | end 81 | 82 | local function update_month(ts, cid, day_changed, day_advanced) 83 | local month = current_month 84 | if current_month == -1 or day_changed then 85 | local t = date("*t", ts / 1e9) 86 | month = tonumber(t.month) 87 | if current_month == -1 then current_month = month end 88 | end 89 | 90 | if day_advanced then 91 | local delta = month - current_month 92 | if delta > 0 then 93 | clear_months(current_month, delta) 94 | current_month = month 95 | elseif delta < 0 then -- roll over the year 96 | clear_months(current_month, MONTHS + delta) 97 | current_month = month 98 | end 99 | end 100 | 101 | month_hll[month]:add(cid) 102 | end 103 | 104 | ---- 105 | 106 | function process_message() 107 | local ts = read_message("Timestamp") 108 | local cid = read_message("Fields[clientId]") 109 | if type(cid) == "string" then 110 | local day = floor(ts / (SEC_IN_DAY * 1e9)) 111 | local day_changed = day ~= current_day 112 | local day_advanced = day > current_day 113 | update_day(ts, cid, day) 114 | update_month(ts, cid, day_changed, day_advanced) 115 | end 116 | return 0 117 | end 118 | 119 | local title = "Firefox Active Daily Instances" 120 | function timer_event(ns) 121 | inject_payload("cbuf", title, day_cb:format("cbuf")) 122 | inject_payload("cbufd", title, day_cb:format("cbufd")) 123 | 124 | local json = {} 125 | local idx = current_month 126 | if idx == -1 then idx = 0 end 127 | 128 | for i=1,MONTHS do 129 | idx = idx + 1 130 | if idx > MONTHS then idx = 1 end 131 | json[i] = {[month_names[idx]] = month_hll[idx]:count()} 132 | end 133 | inject_payload("json", 
"Firefox Active Monthly Instances", cjson.encode(json)) 134 | end 135 | -------------------------------------------------------------------------------- /heka/sandbox/filters/firefox_channel_switching.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Firefox Channel Switching 7 | 8 | *Example Heka Configuration* 9 | 10 | .. code-block:: ini 11 | 12 | [FirefoxChannelSwitching] 13 | type = "SandboxFilter" 14 | filename = "lua_filters/firefox_channel_switching.lua" 15 | message_matcher = "Logger == 'fx' && Type == 'executive_summary' && Fields[vendor] == 'Mozilla' && Fields[app] == 'Firefox'" 16 | memory_limit = 1000000000 17 | ticker_interval = 60 18 | preserve_data = true 19 | 20 | [FirefoxChannelSwitching.config] 21 | anomaly_config = 'mww_nonparametric("nightly", 3, 3, 4, 0.6) mww_nonparametric("beta", 3, 3, 4, 0.6)' 22 | --]] 23 | _PRESERVATION_VERSION = 1 24 | 25 | local fx = require "fx" 26 | require "circular_buffer" 27 | require "cuckoo_filter" 28 | local l = require "lpeg" 29 | require "string" 30 | 31 | local alert = require "alert" 32 | local annotation = require "annotation" 33 | local anomaly = require "anomaly" 34 | local anomaly_config = anomaly.parse_config(read_config("anomaly_config")) 35 | 36 | local rows = read_config("rows") or 180 37 | local sec_per_row = read_config("sec_per_row") or 60*60*24 38 | local COL_NEW = 1 39 | local COL_IN = 2 40 | local COL_OUT = 3 41 | 42 | local function create_cbuf() 43 | local cb = circular_buffer.new(rows, COL_OUT, sec_per_row, true) 44 | cb:set_header(COL_NEW , "new") 45 | cb:set_header(COL_IN , "switched in") 46 | cb:set_header(COL_OUT , "switched out") 47 | return cb 48 | end 49 | 50 | channels = { 51 | {name = "release" , cb = create_cbuf(), cf = cuckoo_filter.new(100e6)}, 52 | {name = "beta" , cb = create_cbuf(), cf = cuckoo_filter.new(10e6)}, 53 | {name = "nightly" , cb = create_cbuf(), cf = cuckoo_filter.new(1e6)}, 54 | -- aurora uses a different profile so we do not expect to see any switches 55 | {name = "aurora", cb = create_cbuf(), cf = cuckoo_filter.new(1e6)}, 56 | {name = "Other" , cb = create_cbuf(), cf = cuckoo_filter.new(100e6)}, 57 | } 58 | local CHANNELS_SIZE = #channels 59 | 60 | function process_message() 61 | local cid = read_message("Fields[clientId]") 62 | if not cid then return -1, "missing clientId" end 63 | 64 | local chan = read_message("Fields[channel]") 65 | if not chan then return -1, "missing channel" end 66 | 67 | chan = fx.normalize_channel(chan) 68 | 69 | local ts = read_message("Timestamp") 70 | local matched, added, deleted = nil, false, false 71 | for i=1, CHANNELS_SIZE do 72 | local v = channels[i] 73 | if v.name == chan then 74 | added = v.cf:add(cid) 75 | matched = v 76 | else 77 | if v.cf:delete(cid) then 78 | v.cb:add(ts, COL_OUT, 1) 79 | deleted = true 80 | end 81 | end 82 | 83 | end 84 | 85 | if added then 86 | if deleted then 87 | matched.cb:add(ts, COL_IN, 1) 88 | else 89 | matched.cb:add(ts, COL_NEW, 1) 90 | end 91 | end 92 | 93 | return 0 94 | end 95 | 96 | function timer_event(ns) 97 | for i,v in ipairs(channels) do 98 | if anomaly_config then 99 | if not alert.throttled(ns) then 100 | local msg, annos = anomaly.detect(ns, v.name, v.cb, anomaly_config) 101 | if msg then 102 | alert.queue(ns, msg) 103 | 
annotation.concat(v.name, annos) 104 | end 105 | end 106 | local a = annotation.prune(v.name, ns) 107 | if a then 108 | inject_payload("cbuf", v.name, a, v.cb:format("cbuf")) 109 | else 110 | inject_payload("cbuf", v.name, v.cb:format("cbuf")) 111 | end 112 | else 113 | inject_payload("cbuf", v.name, v.cb:format("cbuf")) 114 | end 115 | inject_payload("cbufd", v.name, v.cb:format("cbufd")) 116 | end 117 | alert.send_queue(ns) 118 | end 119 | -------------------------------------------------------------------------------- /heka/sandbox/filters/firefox_duplicates.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Firefox Duplicate Telemetry Submission Report 7 | 8 | *Example Heka Configuration* 9 | 10 | .. code-block:: ini 11 | 12 | [FirefoxDuplicates] 13 | type = "SandboxFilter" 14 | filename = "lua_filters/firefox_duplicates.lua" 15 | message_matcher = "Logger == 'fx' && Type == 'executive_summary' && Fields[vendor] == 'Mozilla' && Fields[app] == 'Firefox'" 16 | output_limit = 0 17 | memory_limit = 0 18 | instruction_limit = 0 19 | ticker_interval = 0 20 | preserve_data = false 21 | timer_event_on_shutdown = true 22 | 23 | [FirefoxDuplicates.config] 24 | items = 100000000 25 | --]] 26 | 27 | require "bloom_filter" 28 | require "circular_buffer" 29 | local fx = require "fx" 30 | 31 | local items = read_config("items") or 1000000 32 | local probability = read_config("probability") or 0.01 33 | bf = bloom_filter.new(items, probability) 34 | 35 | local cols = fx.get_channel_count() 36 | cb = circular_buffer.new(180, cols, 60*60*24, true) 37 | for i=1, cols do 38 | cb:set_header(i, fx.get_channel_name(i-1)) 39 | end 40 | 41 | cids = {} 42 | 43 | function process_message() 44 | local did = read_message("Fields[documentId]") 45 | if type(did) == "string" then 46 | if not bf:add(did) then 47 | local ts = read_message("Timestamp") 48 | local channel = read_message("Fields[channel]") 49 | cb:add(ts, fx.get_channel_id(channel) + 1, 1) 50 | 51 | local cid = read_message("Fields[clientId]") 52 | if type(cid) == "string" then 53 | cids[cid] = true 54 | end 55 | end 56 | end 57 | return 0 58 | end 59 | 60 | local title = "graph" 61 | function timer_event(ns) 62 | inject_payload("cbuf", title, cb:format("cbuf")) 63 | inject_payload("cbufd", title, cb:format("cbufd")) 64 | 65 | local found = false 66 | for k,_ in pairs(cids) do 67 | add_to_payload(k, "\n") 68 | found = true 69 | end 70 | 71 | if found then 72 | inject_payload("txt", "clients") 73 | cids = {} 74 | end 75 | end 76 | -------------------------------------------------------------------------------- /heka/sandbox/filters/firefox_searches.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Calculates search totals by engine, origin, and country. 7 | 8 | Config: 9 | 10 | *Example Heka Configuration* 11 | 12 | .. 
code-block:: ini 13 | 14 | [FirefoxSearches] 15 | type = "SandboxFilter" 16 | filename = "lua_filters/firefox_searches.lua" 17 | message_matcher = "Type == 'telemetry' && Fields[docType] == 'main' && Fields[appName] == 'Firefox' && Fields[appVendor] == 'Mozilla'" 18 | ticker_interval = 60 19 | output_limit = 512000 20 | preserve_data = true 21 | --]] 22 | _PRESERVATION_VERSION = 1 23 | 24 | require "cjson" 25 | require "circular_buffer" 26 | require "math" 27 | require "os" 28 | require "string" 29 | 30 | local ROWS = 365 31 | local SEC_PER_ROW = 60 * 60 * 24 32 | 33 | local origins = {"abouthome", "contextmenu", "searchbar", "urlbar", "total"} 34 | local ORIGINS_SIZE = #origins 35 | 36 | local countries = {"US", "CN", "RU", "Total"} 37 | local COUNTRIES_SIZE = #countries 38 | 39 | local function make_cbuf() 40 | local cb = circular_buffer.new(ROWS, ORIGINS_SIZE, SEC_PER_ROW, true) 41 | for i, v in ipairs(origins) do 42 | cb:set_header(i, v) 43 | end 44 | return cb 45 | end 46 | 47 | engines = { 48 | {name = "Bing" , cbuf = make_cbuf(), match = "[Bb]ing"}, 49 | {name = "Google", cbuf = make_cbuf(), match = "[Gg]oogle"}, 50 | {name = "Yahoo" , cbuf = make_cbuf(), match = "[Yy]ahoo"}, 51 | {name = "Other" , cbuf = make_cbuf(), match = "."} 52 | } 53 | 54 | totals = circular_buffer.new(ROWS, #engines * COUNTRIES_SIZE, SEC_PER_ROW, true) 55 | for i, v in ipairs(engines) do 56 | for j, c in ipairs(countries) do 57 | totals:set_header((i-1) * COUNTRIES_SIZE + j, string.format("%s_%s", v.name, c)) 58 | end 59 | end 60 | 61 | local time = os.time 62 | function process_message () 63 | local json = read_message("Fields[payload.keyedHistograms]") 64 | if not json then return -1, "no keyedHistograms" end 65 | 66 | local ok, khist = pcall(cjson.decode, json) 67 | if not ok then return -1, khist end 68 | if type(khist.SEARCH_COUNTS) ~= "table" then return -1, "no SEARCH_COUNTS" end 69 | 70 | local ts = read_message("Timestamp") 71 | for k, v in pairs(khist.SEARCH_COUNTS) do 72 | for i, e in ipairs(engines) do 73 | if string.match(k, e.match) then 74 | if type(v.sum) ~= "number" then return -1, string.format("missing %s.sum", k) end 75 | local c = v.sum 76 | local cc = read_message("Fields[geoCountry]") 77 | for n = 1, COUNTRIES_SIZE - 1 do 78 | if cc == countries[n] then 79 | totals:add(ts, (i-1) * COUNTRIES_SIZE + n, c) 80 | break 81 | end 82 | end 83 | totals:add(ts, (i-1) * COUNTRIES_SIZE + COUNTRIES_SIZE, c) 84 | 85 | for n = 1, ORIGINS_SIZE - 1 do 86 | if string.match(k, origins[n]) then 87 | e.cbuf:add(ts, n, c) 88 | break 89 | end 90 | end 91 | e.cbuf:add(ts, ORIGINS_SIZE, c) 92 | break 93 | end 94 | end 95 | end 96 | return 0 97 | end 98 | 99 | local floor = math.floor 100 | local date = os.date 101 | local json = {} 102 | for i=1, ROWS do 103 | json[i] = {date = "", time_t = 0} 104 | for m, e in ipairs(engines) do 105 | local t = {} 106 | json[i][e.name] = t 107 | for j, c in ipairs(countries) do 108 | t[c] = 0 109 | end 110 | end 111 | end 112 | 113 | local title = "Totals" 114 | function timer_event(ns) 115 | for i, v in ipairs(engines) do 116 | inject_payload("cbuf", v.name, v.cbuf:format("cbuf")) 117 | inject_payload("cbufd", v.name, v.cbuf:format("cbufd")) 118 | end 119 | inject_payload("cbuf", title, totals:format("cbuf")) 120 | inject_payload("cbufd", title, totals:format("cbufd")) 121 | 122 | local ts = totals:current_time() - (ROWS - 1) * SEC_PER_ROW * 1e9 123 | for i, v in ipairs(json) do 124 | v.time_t = floor(ts/1e9) 125 | v.date = date("%F", v.time_t) 126 | for m, e in 
ipairs(engines) do 127 | for j, c in ipairs(countries) do 128 | local val = totals:get(ts, (m-1) * COUNTRIES_SIZE + j) 129 | if val ~= val then val = 0 end 130 | v[e.name][c] = val 131 | end 132 | end 133 | ts = ts + SEC_PER_ROW * 1e9 134 | end 135 | inject_payload("json", "totals", cjson.encode(json)) 136 | end 137 | -------------------------------------------------------------------------------- /heka/sandbox/filters/firefox_usage.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Firefox Usage Hours 7 | 8 | *Example Heka Configuration* 9 | 10 | .. code-block:: ini 11 | 12 | [FirefoxUsage] 13 | type = "SandboxFilter" 14 | filename = "lua_filters/firefox_usage.lua" 15 | message_matcher = "Logger == 'fx' && Type == 'executive_summary' && Fields[docType] == 'main' && Fields[vendor] == 'Mozilla' && Fields[app] == 'Firefox'" 16 | ticker_interval = 60 17 | preserve_data = true 18 | --]] 19 | 20 | require "circular_buffer" 21 | require "cjson" 22 | require "math" 23 | require "os" 24 | require "string" 25 | 26 | local DAYS = 30 27 | local SEC_IN_DAY = 60 * 60 * 24 28 | local floor = math.floor 29 | local date = os.date 30 | 31 | day_cb = circular_buffer.new(DAYS, 1, SEC_IN_DAY, true) 32 | day_cb:set_header(1, "Active Hours") 33 | current_day = -1 34 | 35 | local month_names = {"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", 36 | "Sep", "Oct", "Nov", "Dec"} 37 | local MONTHS = #month_names 38 | months = {} 39 | for i=1,MONTHS do 40 | months[i] = 0 41 | end 42 | current_month = -1 43 | 44 | local function clear_months(s, n) 45 | for i = 1, n do 46 | s = s + 1 47 | if s > MONTHS then s = 1 end 48 | months[s] = 0 49 | end 50 | end 51 | 52 | local function update_month(ts, uptime, day_changed, day_advanced) 53 | local month = current_month 54 | if current_month == -1 or day_changed then 55 | local t = date("*t", ts / 1e9) 56 | month = tonumber(t.month) 57 | if current_month == -1 then current_month = month end 58 | end 59 | 60 | if day_advanced then 61 | local delta = month - current_month 62 | if delta > 0 then 63 | clear_months(current_month, delta) 64 | current_month = month 65 | elseif delta < 0 then -- roll over the year 66 | clear_months(current_month, MONTHS + delta) 67 | current_month = month 68 | end 69 | end 70 | months[month] = months[month] + uptime 71 | end 72 | 73 | ---- 74 | 75 | function process_message() 76 | local hours = read_message("Fields[hours]") 77 | if type(hours) ~= "number" then 78 | return -1, "missing/invalid hours" 79 | end 80 | if hours == 0 then return 0 end 81 | 82 | local ts = read_message("Timestamp") 83 | local day = floor(ts / (SEC_IN_DAY * 1e9)) 84 | local day_changed = day ~= current_day 85 | local day_advanced = false 86 | if day > current_day then 87 | current_day = day 88 | day_advanced = true 89 | elseif current_day - day > 360 * SEC_IN_DAY then 90 | return -1, "data is too old" 91 | end 92 | 93 | day_cb:add(ts, 1, hours) 94 | update_month(ts, hours, day_changed, day_advanced) 95 | 96 | return 0 97 | end 98 | 99 | local title = "Firefox Daily Active Hours" 100 | function timer_event(ns) 101 | inject_payload("cbuf", title, day_cb:format("cbuf")) 102 | inject_payload("cbufd", title, day_cb:format("cbufd")) 103 | 104 | local json = {} 105 | local idx = current_month 106 | if idx == -1 then 
idx = 0 end 107 | 108 | for i=1,MONTHS do 109 | idx = idx + 1 110 | if idx > MONTHS then idx = 1 end 111 | json[i] = {[month_names[idx]] = months[idx]} 112 | end 113 | inject_payload("json", "Firefox Monthly Active Hours", cjson.encode(json)) 114 | end 115 | -------------------------------------------------------------------------------- /heka/sandbox/filters/payload_size.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Extract submission sizes and counts for pipeline messages, emitting small 7 | derived messages for reporting. 8 | 9 | *Example Heka Configuration* 10 | 11 | .. code-block:: ini 12 | 13 | [PayloadSize] 14 | type = "SandboxFilter" 15 | filename = "lua_filters/payload_size.lua" 16 | message_matcher = "Type == 'telemetry' && Logger == 'telemetry'" 17 | ticker_interval = 0 18 | preserve_data = false 19 | 20 | --]] 21 | 22 | local msg = { 23 | Timestamp = nil, 24 | Type = "payload_size", 25 | Payload = nil, 26 | Fields = { 27 | build = "", 28 | channel = "", 29 | docType = "", 30 | size = 0, 31 | submissionDate = "", 32 | } 33 | } 34 | 35 | function process_message() 36 | msg.Timestamp = read_message("Timestamp") 37 | msg.Fields.build = read_message("Fields[appBuildId]") 38 | msg.Fields.channel = read_message("Fields[appUpdateChannel]") 39 | msg.Fields.docType = read_message("Fields[docType]") 40 | msg.Fields.size = read_message("Fields[Size]") 41 | 42 | -- This could be computed from msg.Timestamp, but we need the field for 43 | -- partitioning the data in the S3 Output. 44 | msg.Fields.submissionDate = read_message("Fields[submissionDate]") 45 | 46 | inject_message(msg) 47 | return 0 48 | end 49 | 50 | function timer_event(ns) 51 | 52 | end 53 | -------------------------------------------------------------------------------- /heka/sandbox/filters/telemetry_decoder_view.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Creates a summary view of the TelemetryDecoder Statistics. 
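Totals and failures are derived from the 'heka.all-report' self-monitoring messages by taking the per-decoder delta of ProcessMessageCount-TelemetryDecoder and ProcessMessageFailures-TelemetryDecoder between successive reports; duplicate submissions are detected by inserting each telemetry documentId into a bloom filter that is cleared after each full circular-buffer span.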
7 | 8 | [TelemetryStats] 9 | type = "SandboxFilter" 10 | message_matcher = "Type == 'telemetry' || Type == 'heka.all-report'" 11 | filename = "lua_filters/telemetry_decoder_view.lua" 12 | memory_limit = 120000000 13 | output_limit = 256000 14 | ticker_interval = 60 15 | preserve_data = true 16 | --]] 17 | 18 | require "bloom_filter" 19 | require "circular_buffer" 20 | require "cjson" 21 | require "string" 22 | local alert = require "alert" 23 | 24 | local SEC_PER_ROW = 60 25 | local ROWS = 2880 26 | 27 | local items = read_config("bloom_items") or 3*1e6 28 | local probability = read_config("bloom_probability") or 0.01 29 | local decoder_match = read_config("decoder_match") or "^TelemetryKafkaInput(%d+)" 30 | bf = bloom_filter.new(items, probability) 31 | cb = circular_buffer.new(ROWS, 3, SEC_PER_ROW, true) 32 | local TOTAL = cb:set_header(1, "Total") 33 | local FAILURES = cb:set_header(2, "Failures") 34 | local DUPLICATES = cb:set_header(3, "Duplicates") 35 | id_count = {} -- array of decoder ids and the last seen count 36 | id_failures = {} -- array of decoder ids and the last seen failure count 37 | 38 | local alert_throttle = read_config("alert_throttle") or 3600 39 | alert.set_throttle(alert_throttle * 1e9) 40 | 41 | -- multi-decoder cascade means that we may need to apply a scaling factor to 42 | -- get an accurate count 43 | local scaling_factor = read_config("scaling_factor") or 1 44 | 45 | local function update_delta(ts, col, id, parray, cur) 46 | local previous = parray[id] 47 | if previous then 48 | if type(cur) == "number" then 49 | if cur > previous then 50 | local delta = cur - previous 51 | parray[id] = cur 52 | cb:add(ts, col, delta) 53 | elseif cur < previous then -- system restart 54 | parray[id] = cur 55 | cb:add(ts, col, cur) 56 | end 57 | end 58 | else 59 | if type(cur) == "number" then 60 | parray[id] = cur 61 | cb:set(ts, col, 0/0) -- advance the buffer with a NaN entry 62 | end 63 | end 64 | end 65 | 66 | ---- 67 | 68 | function process_message () 69 | local typ = read_message("Type") 70 | local ts = read_message("Timestamp") 71 | 72 | if typ == "heka.all-report" then 73 | local ok, json = pcall(cjson.decode, read_message("Payload")) 74 | if not ok then return -1, json end 75 | 76 | local t = json.decoders 77 | if not t then 78 | return -1, "No Decoders found" 79 | end 80 | 81 | for i,v in ipairs(t) do 82 | if not v.Name then 83 | return -1, "Decoder is missing its name" 84 | end 85 | 86 | local id = string.match(v.Name, decoder_match) 87 | if id then 88 | id = tonumber(id) 89 | 90 | if type(v["ProcessMessageCount-TelemetryDecoder"]) == "table" then 91 | update_delta(ts, TOTAL, id, id_count, v["ProcessMessageCount-TelemetryDecoder"].value / scaling_factor) 92 | end 93 | 94 | if type(v["ProcessMessageFailures-TelemetryDecoder"]) == "table" then 95 | update_delta(ts, FAILURES, id, id_failures, v["ProcessMessageFailures-TelemetryDecoder"].value) 96 | end 97 | end 98 | end 99 | elseif typ == "telemetry" then 100 | local did = read_message("Fields[documentId]") 101 | if not did then 102 | return -1, "No documentId" 103 | end 104 | 105 | local added = bf:add(did) 106 | if not added then 107 | cb:add(ts, DUPLICATES, 1) 108 | end 109 | end 110 | 111 | return 0 112 | end 113 | 114 | last_cleared = nil 115 | 116 | local title = "Telemetry Decoder Statistics" 117 | function timer_event(ns) 118 | if last_cleared and ns - last_cleared >= 1e9 * ROWS * SEC_PER_ROW then 119 | bf:clear() 120 | last_cleared = ns 121 | elseif not last_cleared then 122 | last_cleared = ns 123 | end 124 
| 125 | if not cb:get(ns, 1) then 126 | cb:add(ns, 1, 0/0) -- always advance the buffer/graph using a NaN value 127 | end 128 | 129 | local sum, samples = cb:compute("sum", 1, cb:current_time() - (SEC_PER_ROW * 1e9)) 130 | if samples == 0 then 131 | alert.send(ns, "no new data") 132 | end 133 | inject_payload("cbuf", title, cb:format("cbuf")) 134 | inject_payload("cbufd", title, cb:format("cbufd")) 135 | end 136 | -------------------------------------------------------------------------------- /heka/sandbox/filters/telemetry_requests.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | _PRESERVATION_VERSION = 1 5 | 6 | require "circular_buffer" 7 | 8 | local title = "Telemetry Requests" 9 | local rows = read_config("rows") or 14400 10 | local sec_per_row = read_config("sec_per_row") or 60 11 | 12 | cbuf = circular_buffer.new(rows, 1, sec_per_row, true) 13 | cbuf:set_header(1, "Requests") 14 | 15 | function process_message () 16 | cbuf:add(read_message("Timestamp"), 1, 1) 17 | return 0 18 | end 19 | 20 | function timer_event(ns) 21 | inject_payload("cbuf", title, cbuf:format("cbuf")) 22 | inject_payload("cbufd", title, cbuf:format("cbufd")) 23 | end 24 | -------------------------------------------------------------------------------- /heka/sandbox/filters/telemetry_s3output_monitors.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Monitors ProcessFileFailures and ProcessMessageCount in the S3 outputs 7 | 8 | Config: 9 | 10 | *Example Heka Configuration* 11 | 12 | .. 
code-block:: ini 13 | 14 | [TelemetryS3OutputMonitors] 15 | type = "SandboxFilter" 16 | filename = "lua_filters/telemetry_s3output_monitors.lua" 17 | ticker_interval = 60 18 | preserve_data = false # should always be reset on Heka restarts 19 | message_matcher = "Type == 'heka.all-report'" 20 | [TelemetryS3OutputMonitors.config] 21 | # CSV to ignore low volume streams 22 | ignore_stalls = "TelemetryErrorsOutput,TelemetryLoopOutput" 23 | --]] 24 | 25 | require "cjson" 26 | require "string" 27 | local alert = require "alert" 28 | local l = require "lpeg" 29 | 30 | local sep = l.P(",") 31 | local elem = l.C((1 - sep)^1) 32 | local item = elem / l.P 33 | local list = item * ("," * item)^0 34 | local function add (a, b) return a + b end 35 | local grammar = l.Cf(list, add) 36 | grammar = grammar:match(read_config("ignore_stalls") or "TelemetryErrorsOutput") 37 | 38 | local plugins = {} 39 | 40 | local function find_plugin(name, ts) 41 | local p = plugins[name] 42 | if not p then 43 | p = {last_alert = 0, last_pff = 0, last_pmc = 0, last_update = ts} 44 | plugins[name] = p 45 | end 46 | return p 47 | end 48 | 49 | function process_message () 50 | local ok, json = pcall(cjson.decode, read_message("Payload")) 51 | if not ok then return -1, json end 52 | if type(json.outputs) ~= "table" then return -1, "missing outputs array" end 53 | 54 | local ts = read_message("Timestamp") 55 | 56 | for i,v in ipairs(json.outputs) do 57 | if type(v) ~= "table" then return -1, "invalid output object" end 58 | if type(v.ProcessFileFailures) == "table" then -- confirm this plugin has the S3 instrumentation 59 | if not v.Name then return -1, "missing plugin Name" end 60 | 61 | local p = find_plugin(v.Name, ts) 62 | local n = v.ProcessFileFailures.value 63 | if type(n) == "number" and n > p.last_pff then 64 | p.msg = string.format("%s ProcessFileFailures has increased to %d", v.Name, n) 65 | p.last_pff = n 66 | end 67 | 68 | if not grammar:match(v.Name) then 69 | n = v.ProcessMessageCount.value 70 | if type(n) == "number" then 71 | if n == p.last_pmc then 72 | if p.last_update + 60 * 1e9 < ts then 73 | p.msg = string.format("%s ProcessMessageCount has stalled at %d", v.Name, n) 74 | end 75 | else 76 | if ts >= p.last_update then 77 | p.last_update = ts 78 | p.last_pmc = n 79 | end 80 | end 81 | end 82 | end 83 | end 84 | end 85 | return 0 86 | end 87 | 88 | function timer_event(ns) 89 | for k,v in pairs(plugins) do 90 | if v.msg then 91 | if ns - v.last_alert > 60 * 60 * 1e9 then -- manual throttling (one alert per plugin per hour) 92 | alert.queue(0, v.msg) 93 | v.last_alert = ns 94 | end 95 | end 96 | v.msg = nil 97 | end 98 | alert.send_queue(0) 99 | end 100 | -------------------------------------------------------------------------------- /heka/sandbox/filters/telemetry_webrtc.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Derived stream for webrtc. https://bugzilla.mozilla.org/show_bug.cgi?id=1231410 7 | 8 | *Example Heka Configuration* 9 | 10 | .. 
code-block:: ini 11 | 12 | [TelemetryWebRTC] 13 | type = "SandboxFilter" 14 | filename = "lua_filters/telemetry_webrtc.lua" 15 | message_matcher = "Type == 'telemetry' && Logger == 'telemetry'" 16 | ticker_interval = 0 17 | preserve_data = false 18 | 19 | --]] 20 | 21 | require 'cjson' 22 | 23 | local function check_payload (payload) 24 | if type(payload) ~= "table" then return false end 25 | local w = payload["webrtc"] or {} 26 | local i = w["IceCandidatesStats"] or {} 27 | if next(i["webrtc"] or {}) or next(i["loop"] or {}) then 28 | return true 29 | end 30 | return false 31 | end 32 | 33 | function process_message() 34 | local ok, json = pcall(cjson.decode, read_message("Payload")) 35 | if not ok then return -1, json end 36 | local p = json["payload"] or {} 37 | local found = check_payload(p) 38 | if not found then 39 | -- check child payloads for E10s 40 | local children = read_message("Fields[payload.childPayloads]") 41 | if not children then return 0 end 42 | ok, json = pcall(cjson.decode, children) 43 | if not ok then return -1, children end 44 | if type(json) ~= "table" then return -1 end 45 | for i, child in ipairs(json) do 46 | found = check_payload(child) 47 | if found then break end 48 | end 49 | end 50 | 51 | if found then 52 | local raw = read_message("raw") 53 | inject_message(raw) 54 | end 55 | return 0 56 | end 57 | 58 | function timer_event(ns) 59 | -- no op 60 | end 61 | -------------------------------------------------------------------------------- /hindsight/analysis/landfill_error.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Simple debug tool to track the types of error in landfill processing. Used 7 | when tuning the validation schemas. 8 | 9 | Config: 10 | 11 | filename = "landfill_errors.lua" 12 | message_matcher = "Type == 'telemetry.error'" 13 | --]] 14 | 15 | require "string" 16 | 17 | local err_msgs = {} 18 | 19 | function process_message() 20 | local de = read_message("Fields[DecodeError]") or "" 21 | local cnt = err_msgs[de] 22 | if cnt then 23 | err_msgs[de] = cnt + 1 24 | else 25 | err_msgs[de] = 1 26 | end 27 | return 0 28 | end 29 | 30 | function timer_event(ns, shutdown) 31 | for k,v in pairs(err_msgs) do 32 | add_to_payload(v, "\t", k, "\n") 33 | end 34 | inject_payload("tsv", "error") 35 | end 36 | -------------------------------------------------------------------------------- /hindsight/input/heka_s3.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | -- This Source Code Form is subject to the terms of the Mozilla Public 6 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 7 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 8 | 9 | --[[ 10 | ## Reader for the S3 Heka files in compressed or uncompressed form 11 | 12 | Retrieves/reads each file from the `s3_file_list`. The primary use of this 13 | plugin is to feed the transformed/validated data into analysis plugins. 
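For reference, a minimal sketch of a downstream analysis plugin (hypothetical, not part of this repository) that tallies the injected messages by Type could look like:

```lua
-- Hypothetical companion analysis plugin: count injected messages by Type.
local counts = {}

function process_message()
    local t = read_message("Type") or "UNKNOWN"
    counts[t] = (counts[t] or 0) + 1
    return 0
end

function timer_event(ns)
    for k, v in pairs(counts) do
        add_to_payload(k, "\t", v, "\n")
    end
    inject_payload("tsv", "message_type_counts")
end
```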
14 | 15 | ### Sample Configuration 16 | ```lua 17 | filename = "heka_s3.lua" 18 | s3_bucket = "net-mozaws-prod-us-west-2-pipeline-data" 19 | s3_file_list = "files.ls.1" 20 | tmp_dir = "/mnt/work/tmp" 21 | ``` 22 | --]] 23 | 24 | require "io" 25 | require "heka_stream_reader" 26 | require "os" 27 | require "string" 28 | 29 | local tmp_dir = read_config("tmp_dir") 30 | local s3_bucket = read_config("s3_bucket") or error("s3_bucket must be set") 31 | local logger = read_config("Logger") 32 | local s3_file_list = assert(io.open(read_config("s3_file_list"))) 33 | 34 | 35 | local function process_file(hsr, fn) 36 | local fh, err = io.open(fn) 37 | if not fh then 38 | print("failed to open", fn) 39 | return 40 | end 41 | 42 | local found, consumed, read 43 | repeat 44 | repeat 45 | found, consumed, read = hsr:find_message(fh) 46 | if found then 47 | inject_message(hsr) 48 | end 49 | until not found 50 | until read == 0 51 | fh:close() 52 | end 53 | 54 | 55 | local function execute_cmd(cmd, retries) 56 | local rv = 1 57 | for i=1, retries do 58 | rv = os.execute(cmd) 59 | if rv == 0 then 60 | break 61 | end 62 | end 63 | return rv 64 | end 65 | 66 | 67 | function process_message() 68 | local hsr = heka_stream_reader.new("s3") 69 | 70 | for fn in s3_file_list:lines() do 71 | local cmd 72 | local tfn = string.format("%s/%s", tmp_dir, logger) 73 | local ext = fn:match("%.([^.]-)$") 74 | if ext == "zst" then 75 | cmd = string.format("aws s3 cp s3://%s/%s - | zstd -d -c - > %s", s3_bucket, fn, tfn) 76 | elseif ext == "gz" then 77 | cmd = string.format("aws s3 cp s3://%s/%s - | gzip -d -c - > %s", s3_bucket, fn, tfn) 78 | else 79 | cmd = string.format("aws s3 cp s3://%s/%s %s", s3_bucket, fn, tfn) 80 | end 81 | 82 | print("processing", cmd) 83 | local rv = execute_cmd(cmd, 3) 84 | if rv == 0 then 85 | process_file(hsr, tfn) 86 | else 87 | print("failed to execute rv:", rv, " cmd:", cmd) 88 | end 89 | end 90 | return 0 91 | end 92 | -------------------------------------------------------------------------------- /hindsight/input/telemetry_s3_snappy.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | -- This Source Code Form is subject to the terms of the Mozilla Public 6 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 7 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 8 | 9 | --[[ 10 | ## Reader for the S3 telemetry files that are Heka framed, snappy encoded 11 | messages 12 | 13 | Retrieves/reads each file from the `s3_file_list`. The primary use of this 14 | plugin is to feed the transformed/validated data into analysis plugins. Once 15 | the snappy ugliness is removed (Bugzilla #1250218) the generalized 'heka_s3.lua' 16 | input can be used instead.
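For files that are not zst/gz re-compressed, the outer Heka framing is located without protobuf-decoding each record (`find_message(fh, false)`); each framed payload is then snappy-uncompressed when possible (falling back to the raw bytes if decompression fails) and re-parsed with a second stream reader before injection.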
17 | 18 | 19 | ### Sample Configuration 20 | ```lua 21 | filename = "telemetry_s3_snappy.lua" 22 | s3_bucket = "net-mozaws-prod-us-west-2-pipeline-data" 23 | s3_file_list = "telemetry_dims.ls.1" 24 | tmp_dir = "/mnt/work/tmp" 25 | ``` 26 | --]] 27 | 28 | require "io" 29 | require "heka_stream_reader" 30 | require "os" 31 | require "snappy" 32 | require "string" 33 | 34 | local tmp_dir = read_config("tmp_dir") 35 | local s3_bucket = read_config("s3_bucket") or error("s3_bucket must be set") 36 | local logger = read_config("Logger") 37 | local s3_file_list = assert(io.open(read_config("s3_file_list"))) 38 | 39 | 40 | local function snappy_decode(msgbytes) 41 | local ok, uc = pcall(snappy.uncompress, msgbytes) 42 | if ok then 43 | return uc 44 | end 45 | return msgbytes 46 | end 47 | 48 | 49 | local function process_snappy_ugliness(hsr, dhsr, fh) 50 | local found, consumed, read 51 | repeat 52 | repeat 53 | found, consumed, read = hsr:find_message(fh, false) -- don't protobuf decode 54 | if found then 55 | local pbm = snappy_decode(hsr:read_message("raw")) 56 | local ok = pcall(dhsr.decode_message, dhsr, pbm) 57 | if ok then 58 | inject_message(dhsr) 59 | end 60 | end 61 | until not found 62 | until read == 0 63 | end 64 | 65 | 66 | local function process_file(hsr, fh) 67 | local found, consumed, read 68 | repeat 69 | repeat 70 | found, consumed, read = hsr:find_message(fh) 71 | if found then 72 | inject_message(hsr) 73 | end 74 | until not found 75 | until read == 0 76 | end 77 | 78 | 79 | local function execute_cmd(cmd, retries) 80 | local rv = 1 81 | for i=1, retries do 82 | rv = os.execute(cmd) 83 | if rv == 0 then 84 | break 85 | end 86 | end 87 | return rv 88 | end 89 | 90 | 91 | function process_message() 92 | local hsr = heka_stream_reader.new("s3") 93 | local dhsr = heka_stream_reader.new("snappy") 94 | 95 | for fn in s3_file_list:lines() do 96 | local cmd 97 | local tfn = string.format("%s/%s", tmp_dir, logger) 98 | local ext = fn:match("%.([^.]-)$") 99 | if ext == "zst" then 100 | cmd = string.format("aws s3 cp s3://%s/%s - | zstd -d -c - > %s", s3_bucket, fn, tfn) 101 | elseif ext == "gz" then 102 | cmd = string.format("aws s3 cp s3://%s/%s - | gzip -d -c - > %s", s3_bucket, fn, tfn) 103 | else 104 | ext = nil 105 | cmd = string.format("aws s3 cp s3://%s/%s %s", s3_bucket, fn, tfn) 106 | end 107 | 108 | print("processing", cmd) 109 | local rv = execute_cmd(cmd, 3) 110 | if rv == 0 then 111 | local fh, err = io.open(tfn) 112 | if not fh then 113 | print("failed to open", tfn) 114 | return 0 115 | end 116 | if ext then 117 | process_file(hsr, fh) 118 | else 119 | process_snappy_ugliness(hsr, dhsr, fh) 120 | end 121 | fh:close() 122 | else 123 | print("failed to execute rv:", rv, " cmd:", cmd) 124 | end 125 | end 126 | return 0 127 | end 128 | -------------------------------------------------------------------------------- /hindsight/io_modules/derived_stream/heka_protobuf.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 
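-- Each schema entry handled by the derived_stream writers in this directory is
-- a five-element array: {name, type, size, attributes, source}, where `source`
-- is either a read_message() field specifier string or a function returning
-- the value.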
4 | 5 | local M = {} 6 | local ipairs = ipairs 7 | local type = type 8 | 9 | local read_message = read_message 10 | local encode_message = encode_message 11 | 12 | local match = require "string".match 13 | 14 | setfenv(1, M) -- Remove external access to contain everything in the module 15 | 16 | function write_message(fh, msg, schema) 17 | for i,v in ipairs(schema) do 18 | local value 19 | if type(v[5]) == "function" then 20 | value = v[5]() 21 | elseif type(v[5]) == "string" then 22 | value = read_message(v[5]) 23 | end 24 | 25 | if value ~= nil then 26 | if v[1] == "Uuid" then 27 | msg.Uuid = value 28 | elseif v[1] == "Timestamp" then 29 | msg.Timestamp = value 30 | elseif v[1] == "Type" then 31 | msg.Type = value 32 | elseif v[1] == "Logger" then 33 | msg.Logger = value 34 | elseif v[1] == "Severity" then 35 | msg.Severity = value 36 | elseif v[1] == "EnvVersion" then 37 | msg.EnvVersion = value 38 | elseif v[1] == "Pid" then 39 | msg.Pid = value 40 | elseif v[1] == "Hostname" then 41 | msg.Hostname = value 42 | else 43 | if type(value) == "number" and match(v[2], "INT") then 44 | msg.Fields[v[1]] = {value = value, value_type = 2} 45 | else 46 | msg.Fields[v[1]] = value 47 | end 48 | end 49 | end 50 | end 51 | fh:write(encode_message(msg, true)) 52 | end 53 | 54 | return M 55 | -------------------------------------------------------------------------------- /hindsight/io_modules/derived_stream/redshift.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | local M = {} 6 | local tostring = tostring 7 | local type = type 8 | 9 | local date = require "os".date 10 | local floor = require "math".floor 11 | local gsub = require "string".gsub 12 | 13 | setfenv(1, M) -- Remove external access to contain everything in the module 14 | 15 | VARCHAR_MAX_LENGTH = 65535 16 | 17 | function strip_nonprint(v) 18 | -- A CHAR column can only contain single-byte characters 19 | -- http://docs.aws.amazon.com/redshift/latest/dg/r_Character_types.html 20 | -- for our use restrict it to printable chars 21 | if v == nil then return end 22 | if type(v) ~= "string" then v = tostring(v) end 23 | return gsub(v, "[^\032-\126]", "?") 24 | end 25 | 26 | function esc_timestamp(v, default) 27 | if type(v) ~= "number" or v > 4294967296e9 or v < 0 then 28 | return default 29 | end 30 | return date("%Y-%m-%d %H:%M:%S.", floor(v / 1e9)) .. 
tostring(floor(v % 1e9 / 1e3)) 31 | end 32 | 33 | function esc_smallint(v, default) 34 | if type(v) ~= "number" or v > 32767 or v < -32767 then 35 | return default 36 | end 37 | return tostring(floor(v)) 38 | end 39 | 40 | function esc_integer(v, default) 41 | if type(v) ~= "number" or v > 2147483647 or v < -2147483647 then 42 | return default 43 | end 44 | return tostring(floor(v)) 45 | end 46 | 47 | function esc_bigint(v, default) 48 | if type(v) ~= "number" then return default end 49 | return tostring(floor(v)) 50 | end 51 | 52 | function esc_double(v, default) 53 | if type(v) ~= "number"then return default end 54 | if v ~= v then return "NaN" end 55 | if v == 1/0 then return "Infinity" end 56 | if v == -1/0 then return "-Infinity" end 57 | return tostring(v) 58 | end 59 | 60 | function esc_boolean(v, default) 61 | if type(v) ~= "boolean" then return default end 62 | if v then return "TRUE" end 63 | return "FALSE" 64 | end 65 | 66 | return M 67 | -------------------------------------------------------------------------------- /hindsight/io_modules/derived_stream/redshift/psv.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | local M = {} 6 | local ipairs = ipairs 7 | local read_message = read_message 8 | local tostring = tostring 9 | local type = type 10 | 11 | local rs = require "derived_stream.redshift" 12 | local string = require "string" 13 | 14 | setfenv(1, M) -- Remove external access to contain everything in the module 15 | 16 | local esc_chars = { ["|"] = "\\|", ["\r"] = "\\r", ["\n"] = "\\n", ["\\"] = "\\\\" } 17 | function esc_varchar(v, max) 18 | if v == nil then return "" end 19 | if max == nil then max = rs.VARCHAR_MAX_LENGTH end 20 | if type(v) ~= "string" then v = tostring(v) end 21 | if string.len(v) > max then v = string.sub(v, 1, max) end 22 | local s, e = string.find(v, "%z") 23 | if s then v = string.sub(v, 1, s-1) end 24 | return string.gsub(v, "[|\r\n\\]", esc_chars) 25 | end 26 | 27 | function write_message(fh, schema) 28 | for i,v in ipairs(schema) do 29 | local value 30 | if type(v[5]) == "function" then 31 | value = v[5]() 32 | elseif type(v[5]) == "string" then 33 | value = read_message(v[5]) 34 | end 35 | 36 | if v[2] == "TIMESTAMP" then 37 | value = rs.esc_timestamp(value, "") 38 | elseif v[2] == "SMALLINT" then 39 | value = rs.esc_smallint(value, "") 40 | elseif v[2] == "INTEGER" then 41 | value = rs.esc_integer(value, "") 42 | elseif v[2] == "BIGINT" then 43 | value = rs.esc_bigint(value, "") 44 | elseif v[2] == "DOUBLE PRECISION" or v[2] == "REAL" or v[2] == "DECIMAL" then 45 | value = rs.esc_double(value, "") 46 | elseif v[2] == "BOOLEAN" then 47 | value = rs.esc_boolean(value, "") 48 | elseif v[2] == "CHAR" then 49 | value = esc_varchar(rs.strip_nonprint(value), v[3]) 50 | elseif v[2] == "VARCHAR" or v[2] == "DATE" then 51 | value = esc_varchar(value, v[3]) 52 | else 53 | error("Invaild Redshift data type (aliases are not allowed): " .. 
tostring(v[2])) 54 | end 55 | 56 | if i > 1 then 57 | fh:write("|", value) 58 | else 59 | fh:write(value) 60 | end 61 | end 62 | fh:write("\n") 63 | end 64 | 65 | return M 66 | -------------------------------------------------------------------------------- /hindsight/io_modules/derived_stream/redshift/sql.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | local M = {} 6 | local error = error 7 | local ipairs = ipairs 8 | local read_message = read_message 9 | local tostring = tostring 10 | local type = type 11 | 12 | local rs = require "derived_stream.redshift" 13 | local string = require "string" 14 | local table = require "table" 15 | 16 | setfenv(1, M) -- Remove external access to contain everything in the module 17 | 18 | function get_create_table_sql(name, schema) 19 | local pieces = {"CREATE TABLE IF NOT EXISTS ", name, " ("} 20 | for i, c in ipairs(schema) do 21 | if i > 1 then 22 | table.insert(pieces, ",") 23 | end 24 | table.insert(pieces, string.format("%s %s", c[1], c[2])) 25 | if c[3] ~= nil then 26 | table.insert(pieces, string.format("(%s)", c[3])) 27 | end 28 | if c[4] then 29 | table.insert(pieces, " " .. c[4]) 30 | end 31 | end 32 | table.insert(pieces, ")") 33 | return table.concat(pieces) 34 | end 35 | 36 | function esc_timestamp(v) 37 | local ts = rs.esc_timestamp(v) 38 | if not ts then return "NULL" end 39 | return string.format("'%s'", ts) 40 | end 41 | 42 | function esc_varchar(con, v, max) 43 | if v == nil then return "NULL" end 44 | if max == nil then max = rs.VARCHAR_MAX_LENGTH end 45 | if type(v) ~= "string" then v = tostring(v) end 46 | if string.len(v) > max then v = string.sub(v, 1, max) end 47 | 48 | local escd = con:escape(v) 49 | if not escd then return "NULL" end 50 | return string.format("'%s'", escd) 51 | end 52 | 53 | function write_message(fh, schema, con) 54 | fh:write("(") 55 | for i,v in ipairs(schema) do 56 | local value = "NULL" 57 | if type(v[5]) == "function" then 58 | value = v[5]() 59 | elseif type(v[5]) == "string" then 60 | value = read_message(v[5]) 61 | end 62 | 63 | if v[2] == "TIMESTAMP" then 64 | value = esc_timestamp(value) 65 | elseif v[2] == "SMALLINT" then 66 | value = rs.esc_smallint(value, "NULL") 67 | elseif v[2] == "INTEGER" then 68 | value = rs.esc_integer(value, "NULL") 69 | elseif v[2] == "BIGINT" then 70 | value = rs.esc_bigint(value, "NULL") 71 | elseif v[2] == "DOUBLE PRECISION" or v[2] == "REAL" or v[2] == "DECIMAL" then 72 | value = rs.esc_double(value, "NULL") 73 | elseif v[2] == "BOOLEAN" then 74 | value = rs.esc_boolean(value, "NULL") 75 | elseif v[2] == "CHAR" then 76 | value = esc_varchar(con, rs.strip_nonprint(value), v[3]) 77 | elseif v[2] == "VARCHAR" or v[2] == "DATE" then 78 | value = esc_varchar(con, value, v[3]) 79 | else 80 | error("Invaild Redshift data type (aliases are not allowed): " .. 
tostring(v[2])) 81 | end 82 | 83 | if i > 1 then 84 | fh:write(",", value) 85 | else 86 | fh:write(value) 87 | end 88 | end 89 | fh:write(")") 90 | end 91 | 92 | return M 93 | -------------------------------------------------------------------------------- /hindsight/io_modules/derived_stream/tsv.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | local M = {} 6 | local ipairs = ipairs 7 | local tostring = tostring 8 | local type = type 9 | 10 | local read_message = read_message 11 | local encode_message = encode_message 12 | 13 | local gsub = require "string".gsub 14 | 15 | setfenv(1, M) -- Remove external access to contain everything in the module 16 | 17 | local esc_chars = { ["\t"] = "\\t", ["\r"] = "\\r", ["\n"] = "\\n", ["\\"] = "\\\\" } 18 | 19 | function esc_str(v) 20 | return gsub(v, "[\t\r\n\\]", esc_chars) 21 | end 22 | 23 | function write_message(fh, schema, nil_value) 24 | for i,v in ipairs(schema) do 25 | local value 26 | if type(v[5]) == "function" then 27 | value = v[5]() 28 | elseif type(v[5]) == "string" then 29 | value = read_message(v[5]) 30 | end 31 | if value == nil then 32 | value = nil_value 33 | else 34 | value = tostring(value) 35 | end 36 | 37 | if v[2] == "CHAR" or v[2] == "VARCHAR" then 38 | value = esc_str(value) 39 | end 40 | 41 | if i > 1 then 42 | fh:write("\t", value) 43 | else 44 | fh:write(value) 45 | end 46 | end 47 | fh:write("\n") 48 | end 49 | 50 | return M 51 | -------------------------------------------------------------------------------- /hindsight/modules/agg.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | local M = {} 6 | local type = type 7 | local pairs = pairs 8 | setfenv(1, M) -- Remove external access to contain everything in the module 9 | 10 | -- Merge two objects. Add all data from "src" to "dest". Numeric values are 11 | -- added, boolean and string values are overwritten, and arrays and objects are 12 | -- recursively merged. 13 | -- Any data with different types in dest and src will be skipped. 14 | -- Example: 15 | --local a = { 16 | -- foo = 1, 17 | -- bar = {1, 1, 3}, 18 | -- quux = 3 19 | --} 20 | --local b = { 21 | -- foo = 5, 22 | -- bar = {0, 0, 5, 1}, 23 | -- baz = { 24 | -- hello = 100 25 | -- } 26 | --} 27 | -- 28 | --local c = merge_objects(a, b) 29 | --------- 30 | -- c contains { 31 | -- foo = 5, 32 | -- bar = {1, 1, 8, 1}, 33 | -- baz = { 34 | -- hello = 100 35 | -- }, 36 | -- quux = 3 37 | --} 38 | function merge_objects(dest, src) 39 | if dest == nil then 40 | return src 41 | end 42 | if src == nil then 43 | return dest 44 | end 45 | 46 | local tdest = type(dest) 47 | local tsrc = type(src) 48 | 49 | -- Types are different. Ignore the src value, because src is wrong. 50 | if tdest ~= tsrc then 51 | return dest 52 | end 53 | 54 | -- types are the same, neither is nil. 
55 | if tdest == "number" then 56 | return dest + src 57 | end 58 | 59 | -- most recent wins: 60 | if tdest == "boolean" or tdest == "string" then 61 | return src 62 | end 63 | 64 | if tdest == "table" then 65 | -- array or object, iterate by key 66 | for k,v in pairs(src) do 67 | dest[k] = merge_objects(dest[k], v) 68 | end 69 | return dest 70 | end 71 | 72 | -- How did we get here? 73 | --print("weird type: ", tdest, "\n") 74 | return dest 75 | end 76 | 77 | return M 78 | -------------------------------------------------------------------------------- /hindsight/output/cbuf2tsv.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Convert a circular buffer output to a TSV for non-Heka dashboard consumption 7 | 8 | Config: 9 | output_path = path to write the converted cbuf(s) to 10 | 11 | 12 | *Example Heka Configuration* 13 | 14 | .. code-block:: ini 15 | 16 | [CbufToDashboard] 17 | type = "SandboxOutput" 18 | filename = "cbuf_dashboard.lua" 19 | message_matcher = "Type == 'heka.sandbox-output' && Fields[payload_type] == 'cbuf'" # convert all cbufs 20 | ticker_interval = 60 21 | 22 | [CbufToDashboard.config] 23 | output_path = "/tmp" 24 | 25 | 26 | Input: 27 | {"time":1423440000,"rows":4,"columns":1,"seconds_per_row":1,"column_info":[{"name":"Active_Users","unit":"count","aggregation":"sum"}]} 28 | 33031 29 | 33526 30 | 40143 31 | 38518 32 | 33 | Output: 34 | Time (time_t) Active Users (count) 35 | 1423440000 33031 36 | 1423440001 33526 37 | 1423440002 40143 38 | 1423440003 38518 39 | 40 | --]] 41 | 42 | require "cjson" 43 | require "io" 44 | require "string" 45 | require "table" 46 | 47 | local output_path = assert(read_config("output_path"), "output_path must be specified") 48 | 49 | function process_message() 50 | local header 51 | local cb_time = 0 52 | local cb_spr = 0 53 | local cb_rows = 0 54 | local body = {} 55 | local cnt = 0 56 | 57 | local payload = read_message("Payload") 58 | for l in string.gmatch(payload, ".-\n") do 59 | if not header then 60 | if string.match(l, "^{") then 61 | local ok, json = pcall(cjson.decode, l) 62 | if not ok then return -1, json end 63 | 64 | if type(json.time) == "number" and 65 | type(json.rows) == "number" and 66 | type(json.seconds_per_row) == "number" and 67 | type(json.column_info) == "table" then 68 | cb_time = json.time 69 | cb_spr = json.seconds_per_row 70 | cb_rows = json.rows 71 | local names = {"Time (time_t)"} 72 | for i, v in ipairs(json.column_info) do 73 | local ok, col = pcall(string.format, "%s (%s)", v.name, v.unit) 74 | if not ok then return -1, "invalid column_info" end 75 | names[i + 1] = col 76 | end 77 | header = table.concat(names, "\t") 78 | end 79 | end 80 | else 81 | cnt = cnt + 1 82 | body[cnt] = string.format("%d\t%s", (cnt - 1) * cb_spr + cb_time, l) 83 | end 84 | end 85 | 86 | if not header then return -1, "malformed cbuf, no header" end 87 | 88 | if cnt < 3 or cnt ~= cb_rows then 89 | return -1, string.format("incorrect number of rows expected: %d, received: %d", cb_rows, cnt) 90 | end 91 | 92 | local logger = read_message("Logger") 93 | 94 | local name = read_message("Fields[payload_name]") or "" 95 | name = string.gsub(name, "%W", "") 96 | if string.len(name) > 64 then name = string.sub(name, 1, 64) end 97 | 98 | local fh = 
assert(io.open(string.format("%s/%s.%s.tsv", output_path, logger, name), "w")) 99 | fh:write(header, "\n", table.concat(body)) 100 | fh:close() 101 | return 0 102 | end 103 | 104 | function timer_event(ns) 105 | -- used to force GC 106 | end 107 | -------------------------------------------------------------------------------- /hindsight/output/crash_summary.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Outputs a crash ping summary derived stream in the specified format one table/file per day. 7 | 8 | Config: 9 | 10 | filename = "crash_summary.lua" 11 | message_matcher = "Type == 'telemetry' && Fields[docType] == 'crash'" 12 | 13 | format = "redshift.psv" 14 | buffer_path = "/mnt/output" 15 | buffer_size = 20 * 1024 * 1024 16 | s3_path = "s3://test" 17 | 18 | --]] 19 | 20 | local ds = require "derived_stream" 21 | local fx = require "fx" 22 | local ping = require "fx.ping" 23 | 24 | local name = "crash_summary" 25 | local schema = { 26 | -- column name type length attributes field /function 27 | {"Timestamp" ,"TIMESTAMP" ,nil ,"SORTKEY" ,"Timestamp"}, 28 | {"crashDate" ,"DATE" ,nil ,nil ,function () return ping.get_date(ping.payload().payload.crashDate) end}, 29 | {"clientId" ,"CHAR" ,36 ,"DISTKEY" ,"Fields[clientId]"}, 30 | {"buildVersion" ,"VARCHAR" ,32 ,nil ,function () return ping.build().version end}, 31 | {"buildId" ,"CHAR" ,14 ,nil ,function () return ping.build().buildId end}, 32 | {"buildArchitecture" ,"VARCHAR" ,32 ,nil ,function () return ping.build().architecture end}, 33 | {"channel" ,"VARCHAR" ,7 ,nil ,function () return fx.normalize_channel(read_message("Fields[appUpdateChannel]")) end}, 34 | {"os" ,"VARCHAR" ,7 ,nil ,function () return fx.normalize_os(read_message("Fields[os]")) end}, 35 | {"osVersion" ,"VARCHAR" ,32 ,nil ,function () return ping.system().os.version end}, 36 | {"osServicepackMajor" ,"VARCHAR" ,32 ,nil ,function () return ping.system().os.servicePackMajor end}, 37 | {"osServicepackMinor" ,"VARCHAR" ,32 ,nil ,function () return ping.system().os.servicePackMinor end}, 38 | {"locale" ,"VARCHAR" ,32 ,nil ,function () return ping.settings().locale end}, 39 | {"activeExperimentId" ,"VARCHAR" ,32 ,nil ,function () return ping.addons().activeExperiment.id end}, 40 | {"activeExperimentBranch" ,"VARCHAR" ,32 ,nil ,function () return ping.addons().activeExperiment.branch end}, 41 | {"country" ,"VARCHAR" ,5 ,nil ,function () return fx.normalize_country(read_message("Fields[geoCountry]")) end}, 42 | {"hasCrashEnvironment" ,"BOOLEAN" ,nil ,nil ,function () return ping.payload().payload.hasCrashEnvironment end}, 43 | } 44 | 45 | local ds_pm 46 | ds_pm, timer_event = ds.load_schema(name, schema) 47 | 48 | function process_message() 49 | ping.clear_cache() 50 | return ds_pm() 51 | end 52 | 53 | -------------------------------------------------------------------------------- /hindsight/output/executive_summary.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | --[[ 6 | Outputs a executive summary based on the main and crash pings as a derived stream 7 | in the specified format one table/file per day. 8 | 9 | Config: 10 | 11 | filename = "executive_summary.lua" 12 | message_matcher = "Logger == 'fx' && Type == 'executive_summary'" 13 | 14 | format = "redshift.psv" 15 | buffer_path = "/mnt/output" 16 | buffer_size = 20 * 1024 * 1024 17 | s3_path = "s3://test" 18 | 19 | --]] 20 | 21 | local ds = require "derived_stream" 22 | local name = "executive_summary" 23 | local schema = { 24 | -- column name type length attributes field /function 25 | {"Timestamp" ,"TIMESTAMP" ,nil ,"SORTKEY" ,"Timestamp"}, 26 | {"activityTimestamp" ,"TIMESTAMP" ,nil ,nil ,"Fields[activityTimestamp]"}, 27 | {"profileCreationTimestamp" ,"TIMESTAMP" ,nil ,nil ,"Fields[profileCreationTimestamp]"}, 28 | {"buildId" ,"CHAR" ,14 ,nil ,"Fields[buildId]"}, 29 | {"clientId" ,"CHAR" ,36 ,"DISTKEY" ,"Fields[clientId]"}, 30 | {"documentId" ,"CHAR" ,36 ,nil ,"Fields[documentId]"}, 31 | {"docType" ,"CHAR" ,36 ,nil ,"Fields[docType]"}, 32 | {"country" ,"VARCHAR" ,5 ,nil ,"Fields[country]"}, 33 | {"channel" ,"VARCHAR" ,7 ,nil ,"Fields[channel]"}, 34 | {"os" ,"VARCHAR" ,7 ,nil ,"Fields[os]"}, 35 | {"osVersion" ,"VARCHAR" ,32 ,nil ,"Fields[osVersion]"}, 36 | {"app" ,"VARCHAR" ,32 ,nil ,"Fields[app]"}, 37 | {"version" ,"VARCHAR" ,32 ,nil ,"Fields[version]"}, 38 | {"vendor" ,"VARCHAR" ,32 ,nil ,"Fields[vendor]"}, 39 | {"reason" ,"VARCHAR" ,32 ,nil ,"Fields[reason]"}, 40 | {'"default"' ,"BOOLEAN" ,nil ,nil ,"Fields[default]"}, 41 | {"hours" ,"DOUBLE PRECISION" ,nil ,nil ,"Fields[hours]"}, 42 | {"google" ,"INTEGER" ,nil ,nil ,"Fields[google]"}, 43 | {"bing" ,"INTEGER" ,nil ,nil ,"Fields[bing]"}, 44 | {"yahoo" ,"INTEGER" ,nil ,nil ,"Fields[yahoo]"}, 45 | {"other" ,"INTEGER" ,nil ,nil ,"Fields[other]"}, 46 | {"city" ,"VARCHAR" ,32 ,nil ,"Fields[city]"}, 47 | } 48 | 49 | process_message, timer_event = ds.load_schema(name, schema) 50 | -------------------------------------------------------------------------------- /hindsight/output/executive_summary_full.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Outputs a executive summary based on the main and crash pings as a derived stream 7 | in the specified format one table/file per day. 
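Unlike executive_summary.lua, which reads already-extracted values from
messages matching Logger == 'fx' && Type == 'executive_summary', this variant
derives the same columns directly from raw 'main' and 'crash' telemetry pings
using the fx and fx.ping modules (see the schema functions below).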
8 | 9 | Config: 10 | 11 | filename = "executive_summary_full.lua" 12 | message_matcher = "Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 13 | 14 | format = "redshift.psv" 15 | buffer_path = "/mnt/output" 16 | buffer_size = 20 * 1024 * 1024 17 | s3_path = "s3://test" 18 | 19 | --]] 20 | 21 | local ds = require "derived_stream" 22 | local fx = require "fx" 23 | local ping = require "fx.ping" 24 | require "string" 25 | 26 | local doc_type 27 | local search_counts 28 | 29 | local function get_activity_timestamp() 30 | local ts 31 | if doc_type == "main" then 32 | ts = ping.get_timestamp(ping.info().subsessionStartDate) 33 | else 34 | ping.get_timestamp(ping.payload().payload.crashDate) 35 | end 36 | 37 | if not ts then 38 | ts = read_message("Fields[creationTimestamp]") 39 | end 40 | return ts 41 | end 42 | 43 | local function get_search_counts() 44 | local cnts = {0, 0, 0, 0} 45 | local sc = ping.khist().SEARCH_COUNTS 46 | if type(sc) ~= "table" then return cnts end 47 | 48 | for k, v in pairs(sc) do 49 | if type(v) == "table" then 50 | for i, e in ipairs({"[Gg]oogle", "[Bb]ing", "[Yy]ahoo", "."}) do 51 | if string.match(k, e) then 52 | if type(v.sum) == "number" then 53 | cnts[i] = cnts[i] + v.sum 54 | end 55 | break 56 | end 57 | end 58 | end 59 | end 60 | return cnts 61 | end 62 | 63 | local name = "executive_summary" 64 | local schema = { 65 | -- column name type length attributes field /function 66 | {"Timestamp" ,"TIMESTAMP" ,nil ,"SORTKEY" ,"Timestamp"}, 67 | {"activityTimestamp" ,"TIMESTAMP" ,nil ,nil ,get_activity_timestamp}, 68 | {"profileCreationTimestamp" ,"TIMESTAMP" ,nil ,nil ,ping.profile_creation_timestamp}, 69 | {"buildId" ,"CHAR" ,14 ,nil ,"Fields[appBuildId]"}, 70 | {"clientId" ,"CHAR" ,36 ,"DISTKEY" ,"Fields[clientId]"}, 71 | {"documentId" ,"CHAR" ,36 ,nil ,"Fields[documentId]"}, 72 | {"docType" ,"CHAR" ,36 ,nil ,function () return doc_type end}, 73 | {"country" ,"VARCHAR" ,5 ,nil ,function () return fx.normalize_country(read_message("Fields[geoCountry]")) end}, 74 | {"channel" ,"VARCHAR" ,7 ,nil ,function () return fx.normalize_channel(read_message("Fields[appUpdateChannel]")) end}, 75 | {"os" ,"VARCHAR" ,7 ,nil ,function () return fx.normalize_os(read_message("Fields[os]")) end}, 76 | {"osVersion" ,"VARCHAR" ,32 ,nil ,function () return ping.system().os.version end}, 77 | {"app" ,"VARCHAR" ,32 ,nil ,"Fields[appName]"}, 78 | {"version" ,"VARCHAR" ,32 ,nil ,"Fields[appVersion]"}, 79 | {"vendor" ,"VARCHAR" ,32 ,nil ,"Fields[appVendor]"}, 80 | {"reason" ,"VARCHAR" ,32 ,nil ,"Fields[reason]"}, 81 | {'"default"' ,"BOOLEAN" ,nil ,nil ,ping.is_default_browser}, 82 | {"hours" ,"DOUBLE PRECISION" ,nil ,nil ,ping.hours}, 83 | {"google" ,"INTEGER" ,nil ,nil ,function () return search_counts[1] end}, 84 | {"bing" ,"INTEGER" ,nil ,nil ,function () return search_counts[2] end}, 85 | {"yahoo" ,"INTEGER" ,nil ,nil ,function () return search_counts[3] end}, 86 | {"other" ,"INTEGER" ,nil ,nil ,function () return search_counts[4] end}, 87 | {"city" ,"VARCHAR" ,32 ,nil ,"Fields[geoCity]"}, 88 | } 89 | 90 | local ds_pm 91 | ds_pm, timer_event = ds.load_schema(name, schema) 92 | 93 | function process_message() 94 | ping.clear_cache() 95 | doc_type = read_message("Fields[docType]") 96 | search_counts = get_search_counts() 97 | return ds_pm() 98 | end 99 | 100 | -------------------------------------------------------------------------------- /hindsight/output/main_summary.lua: -------------------------------------------------------------------------------- 1 | 
-- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Outputs a main ping summary derived stream in the specified format one table/file per day. 7 | 8 | Config: 9 | 10 | filename = "main_summary.lua" 11 | message_matcher = "Type == 'telemetry' && Fields[docType] == 'main'" 12 | 13 | format = "redshift.psv" 14 | buffer_path = "/mnt/output" 15 | buffer_size = 100 * 1024 * 1024 16 | s3_path = "s3://test" 17 | 18 | --]] 19 | 20 | local ds = require "derived_stream" 21 | local fx = require "fx" 22 | local ping = require "fx.ping" 23 | 24 | local name = "main_summary" 25 | local schema = { 26 | -- column name type length attributes field /function 27 | {"Timestamp" ,"TIMESTAMP" ,nil ,"SORTKEY" ,"Timestamp"}, 28 | {"subsessionDate" ,"DATE" ,nil ,nil ,function () return ping.get_date(ping.info().subsessionStartDate) end}, 29 | {"clientId" ,"CHAR" ,36 ,"DISTKEY" ,"Fields[clientId]"}, 30 | {"buildVersion" ,"VARCHAR" ,32 ,nil ,function () return ping.build().version end}, 31 | {"buildId" ,"CHAR" ,14 ,nil ,function () return ping.build().buildId end}, 32 | {"buildArchitecture" ,"VARCHAR" ,32 ,nil ,function () return ping.build().architecture end}, 33 | {"channel" ,"VARCHAR" ,7 ,nil ,function () return fx.normalize_channel(read_message("Fields[appUpdateChannel]")) end}, 34 | {"os" ,"VARCHAR" ,7 ,nil ,function () return fx.normalize_os(read_message("Fields[os]")) end}, 35 | {"osVersion" ,"VARCHAR" ,32 ,nil ,function () return ping.system().os.version end}, 36 | {"osServicepackMajor" ,"VARCHAR" ,32 ,nil ,function () return ping.system().os.servicePackMajor end}, 37 | {"osServicepackMinor" ,"VARCHAR" ,32 ,nil ,function () return ping.system().os.servicePackMinor end}, 38 | {"locale" ,"VARCHAR" ,32 ,nil ,function () return ping.settings().locale end}, 39 | {"activeExperimentId" ,"VARCHAR" ,32 ,nil ,function () return ping.addons().activeExperiment.id end}, 40 | {"activeExperimentBranch" ,"VARCHAR" ,32 ,nil ,function () return ping.addons().activeExperiment.branch end}, 41 | {"country" ,"VARCHAR" ,5 ,nil ,function () return fx.normalize_country(read_message("Fields[geoCountry]")) end}, 42 | {"reason" ,"VARCHAR" ,32 ,nil ,function () return ping.info().reason end}, 43 | {"subsessionLength" ,"INTEGER" ,nil ,nil ,function () return ping.info().subsessionLength end}, 44 | {"timezoneOffset" ,"INTEGER" ,nil ,nil ,function () return ping.info().timezoneOffset end}, 45 | {"pluginHangs" ,"INTEGER" ,nil ,nil ,function () return ping.khist_sum("SUBPROCESS_CRASHES_WITH_DUMP", "pluginhang") end}, 46 | {"abortsPlugin" ,"INTEGER" ,nil ,nil ,function () return ping.khist_sum("SUBPROCESS_ABNORMAL_ABORT", "plugin") end}, 47 | {"abortsContent" ,"INTEGER" ,nil ,nil ,function () return ping.khist_sum("SUBPROCESS_ABNORMAL_ABORT", "content") end}, 48 | {"abortsGmplugin" ,"INTEGER" ,nil ,nil ,function () return ping.khist_sum("SUBPROCESS_ABNORMAL_ABORT", "gmplugin") end}, 49 | {"crashesdetectedPlugin" ,"INTEGER" ,nil ,nil ,function () return ping.khist_sum("SUBPROCESS_CRASHES_WITH_DUMP", "plugin") end}, 50 | {"crashesdetectedContent" ,"INTEGER" ,nil ,nil ,function () return ping.khist_sum("SUBPROCESS_CRASHES_WITH_DUMP", "content") end}, 51 | {"crashesdetectedGmplugin" ,"INTEGER" ,nil ,nil ,function () return ping.khist_sum("SUBPROCESS_CRASHES_WITH_DUMP", "gmplugin") end}, 52 | {"crashSubmitAttemptMain" ,"INTEGER" ,nil ,nil ,function () return 
ping.khist_sum("PROCESS_CRASH_SUBMIT_ATTEMPT", "main-crash") end}, 53 | {"crashSubmitAttemptContent" ,"INTEGER" ,nil ,nil ,function () return ping.khist_sum("PROCESS_CRASH_SUBMIT_ATTEMPT", "content-crash") end}, 54 | {"crashSubmitAttemptPlugin" ,"INTEGER" ,nil ,nil ,function () return ping.khist_sum("PROCESS_CRASH_SUBMIT_ATTEMPT", "plugin-crash") end}, 55 | {"crashSubmitSuccessMain" ,"INTEGER" ,nil ,nil ,function () return ping.khist_sum("PROCESS_CRASH_SUBMIT_SUCCESS", "main-crash") end}, 56 | {"crashSubmitSuccessContent" ,"INTEGER" ,nil ,nil ,function () return ping.khist_sum("PROCESS_CRASH_SUBMIT_SUCCESS", "content-crash") end}, 57 | {"crashSubmitSuccessPlugin" ,"INTEGER" ,nil ,nil ,function () return ping.khist_sum("PROCESS_CRASH_SUBMIT_SUCCESS", "plugin-crash") end}, 58 | {"activeAddons" ,"INTEGER" ,nil ,nil ,function () return ping.num_active_addons() end}, 59 | {"flashVersion" ,"VARCHAR" ,16 ,nil ,function () return ping.flash_version() end}, 60 | } 61 | 62 | local ds_pm 63 | ds_pm, timer_event = ds.load_schema(name, schema) 64 | 65 | function process_message() 66 | ping.clear_cache() 67 | return ds_pm() 68 | end 69 | 70 | -------------------------------------------------------------------------------- /reports/budget/budget.toml: -------------------------------------------------------------------------------- 1 | [hekad] 2 | maxprocs = 8 3 | base_dir = "/mnt/telemetry/output" 4 | share_dir = "/mnt/telemetry/heka/share/heka" 5 | # 8MB 6 | max_message_size = 8388608 7 | 8 | [SnappyDecoder] 9 | 10 | [Multi] 11 | type = "MultiDecoder" 12 | subs = ["SnappyDecoder", "ProtobufDecoder"] 13 | cascade_strategy = "all" 14 | log_sub_errors = true 15 | 16 | [S3Input] 17 | type = "S3SplitFileInput" 18 | s3_bucket = "net-mozaws-prod-us-west-2-pipeline-data" 19 | s3_bucket_prefix = "telemetry-payload-size" 20 | s3_worker_count = 16 21 | s3_read_timeout = 600 22 | schema_file = "schema.json" 23 | decoder = "Multi" 24 | 25 | [PipelineBudget] 26 | type = "SandboxFilter" 27 | filename = "lua_filters/dollars.lua" 28 | message_matcher = "Logger == 'PayloadSize' && Type == 'heka.sandbox.payload_size'" 29 | output_limit = 0 30 | instruction_limit = 0 31 | memory_limit = 0 32 | ticker_interval = 60 33 | timer_event_on_shutdown = true 34 | preserve_data = true 35 | 36 | [PipelineBudget.config] 37 | max_per_channel = 90 38 | 39 | [DashboardOutput] 40 | address = ":8080" 41 | static_directory = "/mnt/telemetry/heka/share/heka/dasher" 42 | ticker_interval = 10 43 | -------------------------------------------------------------------------------- /reports/budget/check_targets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | # Check specified submission day, alert if the data volume exceeds target. 9 | # If we exceed any targets for the day, send an alert email. 
10 | # Targets may be found at 11 | # s3://net-mozaws-prod-us-west-2-pipeline-metadata/telemetry-2/budget_targets.json 12 | 13 | import sys 14 | import json 15 | import argparse 16 | from boto.ses import connect_to_region as ses_connect 17 | 18 | def pct(actual, expected): 19 | return "{:.1%}".format(float(actual) / expected) 20 | 21 | def gb(num_bytes): 22 | return "{:.2f}GB".format(float(num_bytes) / 1024.0 / 1024.0 / 1024.0) 23 | 24 | def fmt_err(channel, docType, actual, expected): 25 | sign = ">" 26 | if actual < expected: 27 | sign = "<" 28 | return "Channel {}, Type {}: Actual {} {} Expected {} ({})".format( 29 | channel, docType, gb(actual), sign, gb(expected), pct(actual, expected)) 30 | 31 | def main(): 32 | parser = argparse.ArgumentParser(description="Check Budget Targets") 33 | parser.add_argument("--day", help="Day to check (YYYYMMDD)", required=True) 34 | parser.add_argument("--targets-file", help="JSON file containing budget targets", type=file, required=True) 35 | parser.add_argument("--data-file", help="JSON file containing observed data", type=file, required=True) 36 | parser.add_argument("--from-email", help="Email 'from:' address", required=True) 37 | parser.add_argument("--to-email", help="Email 'to:' address (multiple allowed)", action="append", required=True) 38 | parser.add_argument("--dry-run", help="Print out what would happen instead of sending alert email", action="store_true") 39 | parser.add_argument("--verbose", help="Print all the messages", action="store_true") 40 | args = parser.parse_args() 41 | 42 | target_day = args.day 43 | try: 44 | targets = json.load(args.targets_file) 45 | except Exception as e: 46 | print "Error parsing JSON from {}: {}".format(args.targets_file.name, e) 47 | return 2 48 | 49 | try: 50 | data = json.load(args.data_file) 51 | except Exception as e: 52 | print "Error parsing JSON from {}: {}".format(args.data_file.name, e) 53 | return 2 54 | 55 | errors = [] 56 | exit_code = 0 57 | try: 58 | s = data["submission"] 59 | for c in targets.keys(): 60 | if c not in s: 61 | if args.verbose: 62 | print "warning: {} not found in data.".format(c) 63 | continue 64 | if target_day not in s[c]: 65 | if args.verbose: 66 | print "warning: {}/{} not found in data.".format(c, target_day) 67 | continue 68 | 69 | scd = s[c][target_day] 70 | clients = targets[c]["clients"] 71 | for docType in targets[c].keys(): 72 | if docType == "clients": 73 | continue 74 | else: 75 | if docType not in scd: 76 | if args.verbose: 77 | print "warning: {}/{}/{} not found in data.".format(c, target_day, docType) 78 | continue 79 | scdt = scd[docType] 80 | expected_size = targets[c][docType]["size"] * targets[c][docType]["count"] * clients 81 | actual_size = scdt["size"] 82 | if actual_size > expected_size: 83 | errors.append(fmt_err(c, docType, actual_size, expected_size)) 84 | else: 85 | if args.verbose: 86 | print "ok: {}".format(fmt_err(c, docType, actual_size, expected_size)) 87 | except Exception as e: 88 | print "Data error: {}".format(e) 89 | exit_code = 3 90 | 91 | if len(errors) > 0: 92 | message = "Incoming data for {} exceeded budget targets:\n".format(args.day) + "\n".join(sorted(errors)) 93 | subject = "Incoming Telemetry data exceeded budget targets for {}".format(args.day) 94 | if args.dry_run: 95 | print "Dry-run mode. 
Would have sent:" 96 | print "==============================" 97 | print " From:", args.from_email 98 | print " To:", args.to_email 99 | print "Subject:", subject 100 | print " Body:", message 101 | else: 102 | # ses = ses_connect('us-east-1') 103 | ses = ses_connect('us-west-2') 104 | ses.send_email( 105 | source = args.from_email, 106 | subject = subject, 107 | format = "text", 108 | body = message, 109 | to_addresses = args.to_email 110 | ) 111 | elif args.dry_run: 112 | print "Dry-run mode, but would not have sent any alerts." 113 | 114 | return exit_code 115 | 116 | if __name__ == "__main__": 117 | sys.exit(main()) 118 | -------------------------------------------------------------------------------- /reports/budget/package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | VERSION=0.3 4 | 5 | tar czvf budget-report-${VERSION}.tar.gz budget.toml run.sh schema_template.json check_targets.py 6 | -------------------------------------------------------------------------------- /reports/budget/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | OUTPUT=output 3 | if [ ! -d "$OUTPUT" ]; then 4 | mkdir -p "$OUTPUT/sandbox_preservation" 5 | fi 6 | 7 | # If we have an argument, process that day. 8 | TARGET=$1 9 | if [ -z "$TARGET" ]; then 10 | # Default to processing "yesterday" 11 | TARGET=$(date -d 'yesterday' +%Y%m%d) 12 | fi 13 | 14 | # Install heka 15 | wget http://people.mozilla.org/~mreid/heka-20150918-0_11_0-linux-amd64.tar.gz -O heka.tar.gz 16 | tar xzf heka.tar.gz 17 | mv heka-* heka 18 | 19 | echo "Fetching previous state..." 20 | 21 | aws s3 sync s3://telemetry-private-analysis-2/budget-report/data/sandbox_preservation/ "$OUTPUT/sandbox_preservation/" 22 | 23 | sed -r "s/__TARGET__/$TARGET/" schema_template.json > schema.json 24 | heka/bin/hekad -config budget.toml 25 | 26 | # Push json to prod report bucket/path 27 | DATA="$OUTPUT/dashboard/data/PipelineBudget.SubmissionSizesbychannelanddate.json" 28 | aws s3 cp "$DATA" s3://net-mozaws-prod-metrics-data/telemetry-budget-dashboard/budget.json --acl bucket-owner-full-control 29 | 30 | echo "Fetching budget targets" 31 | aws s3 cp s3://net-mozaws-prod-us-west-2-pipeline-metadata/telemetry-2/budget_targets.json ./ 32 | 33 | # Alert if data for $TARGET exceeds expected volume. 
34 | ALERT_FROM=telemetry-alerts@mozilla.com 35 | ALERT_TO=$ALERT_FROM 36 | echo "Checking if we've exceeded targets" 37 | python check_targets.py --day $TARGET \ 38 | --targets-file budget_targets.json \ 39 | --data-file "$DATA" \ 40 | --from-email $ALERT_FROM \ 41 | --to-email $ALERT_TO \ 42 | --verbose 43 | -------------------------------------------------------------------------------- /reports/budget/schema_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 2, 3 | "dimensions": [ 4 | { "field_name": "submissionDate", "allowed_values": "__TARGET__" }, 5 | { "field_name": "docType", "allowed_values": "*" }, 6 | { "field_name": "channel", "allowed_values": "*" } 7 | ] 8 | } 9 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/hindsight.cfg: -------------------------------------------------------------------------------- 1 | output_path = "output" 2 | output_size = 1024 * 1024 * 1024 3 | sandbox_load_path = "" 4 | sandbox_run_path = "run" 5 | analysis_threads = 1 6 | analysis_lua_path = "/usr/lib/luasandbox/modules/?.lua;/mnt/telemetry/heka/share/heka/lua_modules/?.lua" 7 | analysis_lua_cpath = "/usr/lib/luasandbox/modules/?.so;/mnt/telemetry/heka/share/heka/lua_modules/?.so" 8 | io_lua_path = analysis_lua_path .. ";/usr/lib/luasandbox/io_modules/?.lua;/mnt/telemetry/heka/share/heka/lua_io_modules/?.lua" 9 | io_lua_cpath = analysis_lua_cpath .. ";/usr/lib/luasandbox/io_modules/?.so;/mnt/telemetry/heka/share/heka/lua_io_modules/?.so" 10 | max_message_size = 1024 * 1024 * 1024 11 | backpressure = 2 12 | 13 | analysis_defaults = { 14 | output_limit = 0, 15 | memory_limit = 0, 16 | instruction_limit = 0, 17 | ticker_interval = 0, 18 | preserve_data = false, 19 | } 20 | 21 | input_defaults = { 22 | output_limit = 1024 * 1024 * 8, 23 | instruction_limit = 0, 24 | preserve_data = false, 25 | } 26 | 27 | output_defaults = { 28 | output_limit = 1024 * 1024 * 8, 29 | ticker_interval = 0, 30 | instruction_limit = 0, 31 | memory_limit = 0, 32 | preserve_data = false, 33 | } 34 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | require "io" 6 | require "heka_stream_reader" 7 | require "snappy" 8 | 9 | local hsr = heka_stream_reader.new("stdin") 10 | local dhsr = heka_stream_reader.new("snappy") 11 | 12 | local function snappy_decode(msgbytes) 13 | local ok, uc = pcall(snappy.uncompress, msgbytes) 14 | if ok then 15 | return uc 16 | end 17 | return msgbytes 18 | end 19 | 20 | function process_message() 21 | local fh = assert(io.popen("cat " .. read_config("list") .. 
" | ../heka/bin/s3cat -bucket='net-mozaws-prod-us-west-2-pipeline-data' -stdin=true")) 22 | local found, consumed, read 23 | repeat 24 | repeat 25 | found, consumed, read = hsr:find_message(fh, false) -- don't protobuf decode 26 | if found then 27 | local pbm = snappy_decode(hsr:read_message("raw")) 28 | local ok = pcall(dhsr.decode_message, dhsr, pbm) 29 | if ok then 30 | inject_message(dhsr) 31 | end 32 | end 33 | until not found 34 | until read == 0 35 | return 0 36 | end 37 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen01.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xaa" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen02.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xab" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen03.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xac" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen04.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xad" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen05.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xae" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen06.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xaf" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen07.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xag" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen08.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xah" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen09.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xai" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen10.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = 
"xaj" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen11.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xak" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen12.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xal" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen13.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xam" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen14.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xan" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen15.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xao" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen16.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xap" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/prune_input.cfg: -------------------------------------------------------------------------------- 1 | filename = "prune_input.lua" 2 | ticker_interval = 60 3 | 4 | output_path = "output" 5 | exit_on_stall = true 6 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/prune_input.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Hindsight input log file pruner 7 | 8 | Deletes the log files generated by the input plugins, when all the analysis and 9 | output plugins are done consumining them (within ticker_interval seconds). 10 | 11 | *Example Hindsight Configuration* 12 | 13 | .. code-block:: lua 14 | filename = "prune_input.lua" 15 | ticker_interval = 60 16 | 17 | output_path = "output" -- Path to the hindsight.cp file. 18 | exit_on_stall = false -- When true, causes the plugin to stop/abort when the checkpoints are no longer advancing. 19 | -- Use this option to allow hindsight_cli to exit when the inputs are finished. This plugin/option 20 | -- is typically used when streaming a large data set from something like s3 i.e., running 21 | -- a report. 
22 | --]] 23 | 24 | require "io" 25 | require "os" 26 | require "string" 27 | require "math" 28 | local l = require "lpeg" 29 | l.locale(l) 30 | 31 | local output_path = read_config("output_path") or error("output_path must be set") 32 | local exit_on_stall = read_config("exit_on_stall") 33 | 34 | local function get_min(t, i, o) 35 | if not t.min then t.min = math.huge end 36 | if i < t.min then 37 | t.min = i 38 | t.off = o 39 | end 40 | return t 41 | end 42 | 43 | local pair = l.P"'" * l.Cg(l.digit^1/tonumber * ":" * l.C(l.digit^1)) * "'" 44 | local ignore = (l.P(1) - "\n")^0 * "\n" 45 | local line = l.P"_G['input->" * (l.P(1) - "'")^1 * "']" * l.space^0 * "=" * l.space^0 * pair * l.space^0 + ignore 46 | local grammar = l.Cf(l.Ct("") * line^1, get_min) 47 | local min, off = -1, -1 48 | 49 | function process_message() 50 | local fh = io.open(output_path .. "/hindsight.cp") 51 | if not fh then return 0 end -- checkpoint file not available yet 52 | 53 | local s = fh:read("*a") 54 | fh:close() 55 | if s then 56 | local t = grammar:match(s) 57 | if t then 58 | if min == t.min and off == t.off then 59 | if exit_on_stall then 60 | error("input has stopped") 61 | end 62 | else 63 | off = t.off 64 | if min ~= t.min then 65 | min = t.min 66 | for i = min - 1, 0, -1 do 67 | local r = os.remove(string.format("%s/input/%d.log", output_path, i)) 68 | if not r then break end 69 | end 70 | end 71 | end 72 | end 73 | end 74 | return 0 75 | end 76 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary.lua: -------------------------------------------------------------------------------- 1 | ../../../heka/share/heka/lua_outputs/crash_summary.lua -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary01.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid < '\016' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary02.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\016' && Uuid < '\032' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary03.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\032' && Uuid < '\048' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary04.cfg: 
-------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\048' && Uuid < '\064' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary05.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\064' && Uuid < '\080' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary06.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\080' && Uuid < '\096' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary07.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\096' && Uuid < '\112' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary08.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\112' && Uuid < '\128' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary09.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\128' && Uuid < '\144' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary10.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\144' && Uuid < '\160' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | 
format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary11.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\160' && Uuid < '\176' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary12.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\176' && Uuid < '\192' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary13.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\192' && Uuid < '\208' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary14.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\208' && Uuid < '\224' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary15.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\224' && Uuid < '\240' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary16.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\240' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- 
/reports/derived_streams/derived_streams/run/output/executive_summary01.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid < '\016' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary02.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\016' && Uuid < '\032' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary03.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\032' && Uuid < '\048' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary04.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\048' && Uuid < '\064' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary05.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\064' && Uuid < '\080' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary06.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\080' && Uuid < '\096' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | 
-------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary07.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\096' && Uuid < '\112' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary08.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\112' && Uuid < '\128' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary09.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\128' && Uuid < '\144' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary10.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\144' && Uuid < '\160' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary11.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\160' && Uuid < '\176' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary12.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\176' && Uuid < '\192' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = 
"s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary13.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\192' && Uuid < '\208' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary14.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\208' && Uuid < '\224' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary15.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\224' && Uuid < '\240' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary16.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\240' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary_full.lua: -------------------------------------------------------------------------------- 1 | ../../../heka/share/heka/lua_outputs/executive_summary_full.lua -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary.lua: -------------------------------------------------------------------------------- 1 | ../../../heka/share/heka/lua_outputs/main_summary.lua -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary01.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid < '\016' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = 
"s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary02.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\016' && Uuid < '\032' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary03.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\032' && Uuid < '\048' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary04.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\048' && Uuid < '\064' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary05.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\064' && Uuid < '\080' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary06.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\080' && Uuid < '\096' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary07.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\096' && Uuid < '\112' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary08.cfg: 
-------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\112' && Uuid < '\128' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary09.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\128' && Uuid < '\144' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary10.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\144' && Uuid < '\160' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary11.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\160' && Uuid < '\176' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary12.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\176' && Uuid < '\192' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary13.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\192' && Uuid < '\208' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary14.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\208' && Uuid < '\224' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 
5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary15.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\224' && Uuid < '\240' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary16.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\240' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/schema_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dimensions": [ 4 | { "field_name": "submissionDate", "allowed_values": "__TARGET__"}, 5 | { "field_name": "sourceName", "allowed_values": "telemetry" }, 6 | { "field_name": "sourceVersion", "allowed_values": "4" }, 7 | { "field_name": "docType", "allowed_values": ["main", "crash"]}, 8 | { "field_name": "appName", "allowed_values": "*" }, 9 | { "field_name": "appUpdateChannel", "allowed_values": "*" }, 10 | { "field_name": "appVersion", "allowed_values": "*" }, 11 | { "field_name": "appBuildId", "allowed_values": "*"} 12 | ] 13 | } 14 | 15 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/splitter.lua: -------------------------------------------------------------------------------- 1 | require "io" 2 | require "string" 3 | 4 | local PARTITIONS = 16 5 | local fhs = {} 6 | for i=1, PARTITIONS do 7 | fhs[i] = assert(io.open(string.format("xa%c", 96 + i ), "w+")) 8 | end 9 | local cnt = 0 10 | 11 | for line in io.lines("list.txt") do 12 | local idx = cnt % PARTITIONS + 1 13 | fhs[idx]:write(line, "\n") 14 | cnt = cnt + 1 15 | end 16 | 17 | for i=1, PARTITIONS do 18 | fhs[i]:close() 19 | end 20 | -------------------------------------------------------------------------------- /reports/derived_streams/hindsight/bin/hindsight: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla-services/data-pipeline/0c94d328f243338d21bae360547c300ac1b82b12/reports/derived_streams/hindsight/bin/hindsight -------------------------------------------------------------------------------- /reports/derived_streams/hindsight/bin/hindsight_cli: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla-services/data-pipeline/0c94d328f243338d21bae360547c300ac1b82b12/reports/derived_streams/hindsight/bin/hindsight_cli -------------------------------------------------------------------------------- /reports/derived_streams/luasandbox-0.10.2-Linux-core.deb: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla-services/data-pipeline/0c94d328f243338d21bae360547c300ac1b82b12/reports/derived_streams/luasandbox-0.10.2-Linux-core.deb -------------------------------------------------------------------------------- /reports/derived_streams/package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | VERSION=0.11 4 | 5 | # Git doesn't keep empty dirs :( Ensure that all necessary dirs are present. 6 | mkdir -p derived_streams/run/analysis 7 | mkdir -p derived_streams/run/input 8 | mkdir -p derived_streams/run/output 9 | 10 | tar czvf derived_streams-v4-${VERSION}.tar.gz derived_streams hindsight luasandbox-0.10.2-Linux-core.deb run.sh snappy.so 11 | -------------------------------------------------------------------------------- /reports/derived_streams/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install dependencies 4 | sudo apt-get --yes install lua5.1 postgresql-client jq 5 | sudo dpkg -i luasandbox-0.10.2-Linux-core.deb 6 | 7 | OUTPUT=output 8 | if [ ! -d "$OUTPUT" ]; then 9 | mkdir -p "$OUTPUT" 10 | fi 11 | 12 | S3OUTPUT=s3output 13 | if [ ! -d "$S3OUTPUT" ]; then 14 | mkdir -p "$S3OUTPUT" 15 | fi 16 | 17 | # Install dependencies 18 | wget http://people.mozilla.org/~mtrinkala/heka-20151124-0_11_0-linux-amd64.tar.gz -O heka.tar.gz 19 | tar xzf heka.tar.gz 20 | 21 | # Rename the dir to make it easier to refer to 22 | mv heka-* heka 23 | cp snappy.so heka/share/heka/lua_modules/ 24 | 25 | cd derived_streams 26 | # If we have an argument, process that day. 27 | TARGET=$1 28 | if [ -z "$TARGET" ]; then 29 | # Default to processing "yesterday" 30 | TARGET=$(date -d 'yesterday' +%Y%m%d) 31 | fi 32 | 33 | # Update schema with target: 34 | sed -r "s/__TARGET__/$TARGET/" schema_template.json > schema.json 35 | 36 | # Fetch metadata 37 | META=net-mozaws-prod-us-west-2-pipeline-metadata 38 | # Get metadata: 39 | aws s3 cp s3://$META/sources.json ./ 40 | 41 | # Get the Telemetry data location 42 | BUCKET=$(jq -r '.["telemetry"].bucket' < sources.json) 43 | PREFIX=$(jq -r '.["telemetry"].prefix' < sources.json) 44 | 45 | # Run code: 46 | ../heka/bin/heka-s3list -schema schema.json -bucket="$BUCKET" -bucket-prefix="$PREFIX" > list.txt 47 | lua splitter.lua 48 | ../hindsight/bin/hindsight_cli hindsight.cfg 7 49 | 50 | RV=$? 51 | 52 | if [ $RV -ne 0 ]; then 53 | echo "Hindsight encountered an error, returned a value of $RV. Not proceeding with DB load." 54 | exit $RV 55 | fi 56 | 57 | echo "Loading data for $TARGET into Redshift..." 58 | 59 | ## TODO: We assume these are all in the same database. Should fetch credentials 60 | # for each table separately. 
61 | META_PREFIX=$(jq -r '.["telemetry-executive-summary-db"]["metadata_prefix"]' < sources.json) 62 | # Get read-write credentials: 63 | aws s3 cp s3://$META/$META_PREFIX/write/credentials.json ./ 64 | 65 | DB_HOST=$(jq -r '.host' < credentials.json) 66 | DB_PORT=$(jq -r '.port' < credentials.json) 67 | DB_NAME=$(jq -r '.db_name' < credentials.json) 68 | DB_USER=$(jq -r '.username' < credentials.json) 69 | DB_PASS=$(jq -r '.password' < credentials.json) 70 | 71 | # Install these credentials for psql to use 72 | # See http://www.postgresql.org/docs/current/static/libpq-pgpass.html 73 | echo "$DB_HOST:$DB_PORT:$DB_NAME:$DB_USER:$DB_PASS" >> ~/.pgpass 74 | chmod 0600 ~/.pgpass 75 | 76 | PQ="psql -U $DB_USER -h $DB_HOST -p $DB_PORT $DB_NAME" 77 | 78 | # Fetch AWS credentials for IAM role 79 | # See http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html#instance-metadata-security-credentials 80 | IAM_ROLE_NAME=$(curl http://169.254.169.254/latest/meta-data/iam/security-credentials/) 81 | curl http://169.254.169.254/latest/meta-data/iam/security-credentials/${IAM_ROLE_NAME} > aws_creds.json 82 | AWS_KEY=$(jq -r '.AccessKeyId' < aws_creds.json) 83 | AWS_SECRET_KEY=$(jq -r '.SecretAccessKey' < aws_creds.json) 84 | TOKEN=$(jq -r '.Token' < aws_creds.json) 85 | 86 | # See http://docs.aws.amazon.com/redshift/latest/dg/copy-parameters-credentials.html 87 | CREDS="aws_access_key_id=${AWS_KEY};aws_secret_access_key=${AWS_SECRET_KEY};token=${TOKEN}" 88 | for t in main crash executive; do 89 | NEW_TABLE="${t}_summary_${TARGET}" 90 | echo "Copying data for $NEW_TABLE..." 91 | $PQ -c "CREATE TABLE IF NOT EXISTS $NEW_TABLE (LIKE ${t}_summary including defaults);" 92 | $PQ -c "COPY $NEW_TABLE FROM 's3://telemetry-private-analysis-2/derived_streams/data/${NEW_TABLE}' CREDENTIALS '$CREDS' ACCEPTANYDATE TRUNCATECOLUMNS ESCAPE ACCEPTINVCHARS as ' ';" 93 | $PQ -c "GRANT SELECT ON $NEW_TABLE TO read_only;" 94 | done 95 | -------------------------------------------------------------------------------- /reports/derived_streams/snappy.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla-services/data-pipeline/0c94d328f243338d21bae360547c300ac1b82b12/reports/derived_streams/snappy.so -------------------------------------------------------------------------------- /reports/engagement_ratio/README.txt: -------------------------------------------------------------------------------- 1 | To Deploy 2 | ========= 3 | 4 | 1. Log in to Telemetry Self-Serve Data Analysis 5 | 2. Click 'Schedule a Spark Job' 6 | 3. Edit or create a job with the following parameters: 7 | Job Name: telemetry-engagement-ratio 8 | Notebook or Jar: Upload MauDau.ipynb 9 | Spark Submission Args: N/A 10 | Cluster Size: 10 11 | Output Visibility: Public 12 | Schedule Frequency: Daily 13 | Day of Week: N/A (Sunday) 14 | Day of Month: N/A (1) 15 | Time of Day (UTC): 4am 16 | Job Timeout (minutes): 300 17 | -------------------------------------------------------------------------------- /reports/executive_summary/README.txt: -------------------------------------------------------------------------------- 1 | To Deploy 2 | ========= 3 | 4 | 1. Run 'package.sh' to create executive-report-v4-0.X.tar.gz 5 | 2. Log in to Telemetry Self-Serve Data Analysis 6 | 3. Click 'Schedule a job' 7 | 4. Edit or create a job with the following parameters: 8 | 4a. 
Weekly: 9 | Job Name: executive-report-weekly 10 | Code Tarball: Upload executive-report-v4-0.X.tar.gz 11 | Execution Commandline: ./run.sh weekly 12 | Output Directory: output 13 | Output Visibility: Private 14 | Schedule Frequency: Weekly 15 | Day of Week: Monday 16 | Day of Month: n/a (1) 17 | Time of Day (UTC): 10am 18 | Job Timeout (minutes): 300 19 | 4b. Monthly: 20 | Job Name: executive-report-monthly 21 | Code Tarball: Upload executive-report-v4-0.X.tar.gz 22 | Execution Commandline: ./run.sh monthly 23 | Output Directory: output 24 | Output Visibility: Private 25 | Schedule Frequency: Monthly 26 | Day of Week: n/a (Sunday) 27 | Day of Month: 1 (It will run for the previous month) 28 | Time of Day (UTC): 10am 29 | Job Timeout (minutes): 600 30 | -------------------------------------------------------------------------------- /reports/executive_summary/package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | VERSION=0.14 4 | tar czvf executive-report-v4-${VERSION}.tar.gz run.sh run_executive_report.py reformat_v4.py 5 | -------------------------------------------------------------------------------- /reports/executive_summary/reformat_v4.py: -------------------------------------------------------------------------------- 1 | ''' 2 | reformat_v4.py 3 | -------------- 4 | 5 | This script automatically formats any of the v4 rollups generated by Trink 6 | into more consumable dashboard-friendly formats. It simplifies some fields 7 | and creates 'all' categories for the facets so the js on the frontend has to 8 | do only a little bit of processing before everything hits crossfilter. 9 | 10 | 11 | ''' 12 | 13 | import csv 14 | import argparse 15 | import datetime 16 | 17 | parser = argparse.ArgumentParser(description='Reformats the v4 data') 18 | parser.add_argument('-f', '--file', type=str, help='input file to be converted') 19 | parser.add_argument('-o', '--output', type=str, help='output file') 20 | args = parser.parse_args() 21 | 22 | INPUT = args.file 23 | OUTPUT = args.output 24 | 25 | 26 | f = open(INPUT, 'r') 27 | r = csv.reader(f) 28 | 29 | headers = r.next() 30 | COUNTRIES = set(['US','CA','BR','MX','FR','ES','IT','PL','TR','RU','DE','IN','ID','CN','JP','GB']) 31 | OSES = {'WINNT': 'Windows', "Darwin": "Mac", "Linux": "Linux", 'Other':'Other'} 32 | CHANNELS = set(['release', 'beta', 'aurora', 'nightly']) 33 | data_keys = ['actives', 'hours','inactives','new_records', 'five_of_seven', 'total_records', 'crashes', 'default','google', 'bing', 'yahoo', 'other']; 34 | out={} 35 | 36 | def num(s): 37 | try: 38 | return int(s) 39 | except ValueError: 40 | return float(s) 41 | total=0 42 | 43 | for line in r: 44 | 45 | line = dict(zip(headers,line)) 46 | if line['date'] < datetime.datetime.now().strftime('%Y-%m-%d'): 47 | # Don't re-aggregate 'all' lines 48 | if line['geo'] == 'all' or line['channel'] == 'all' or line['os'] == 'all': 49 | continue 50 | 51 | if line['geo'] not in COUNTRIES: line['geo']='Other' 52 | if line['channel'] not in CHANNELS: line['channel'] = 'Other' 53 | for geo in ['all', line['geo']]: 54 | if geo not in out: out[geo]={} 55 | for channel in ['all', line['channel']]: 56 | if channel not in out[geo]: out[geo][channel]={} 57 | for os in ['all', line['os']]: 58 | if os not in out[geo][channel]: out[geo][channel][os]={} 59 | dt = line['date'] 60 | if dt not in out[geo][channel][os]: out[geo][channel][os][dt]={} 61 | for d in data_keys: 62 | if d not in out[geo][channel][os][dt]: 
out[geo][channel][os][dt][d]=0 63 | out[geo][channel][os][dt][d]+=num(line[d]) 64 | 65 | w = csv.writer(open(OUTPUT, 'w')) 66 | w.writerow(headers) 67 | 68 | for g in out: 69 | for c in out[g]: 70 | for o in out[g][c]: 71 | for dt in out[g][c][o]: 72 | data_values = [out[g][c][o][dt][_] for _ in data_keys] 73 | w.writerow([g,c,o,dt] + data_values) 74 | -------------------------------------------------------------------------------- /reports/executive_summary/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | USAGE="Usage: bash $0 {monthly|weekly} [report_start_yyyymmdd]\nIf not specified, report start defaults to the most recent completed reporting period." 4 | OUTPUT=output 5 | if [ ! -d "$OUTPUT" ]; then 6 | mkdir -p "$OUTPUT" 7 | fi 8 | 9 | # First argument is "mode". It is required. 10 | MODE=$1 11 | if [ "$MODE" != "weekly" -a "$MODE" != "monthly" ]; then 12 | echo "Error: specify 'weekly' or 'monthly' report mode." 13 | echo -e $USAGE 14 | exit 1 15 | fi 16 | 17 | # If we have a date argument, use that as the report start day. 18 | TARGET=$2 19 | if [ -z "$TARGET" ]; then 20 | # Default to processing the most recent completed reporting period. 21 | # For a week, that is the period ending on the most recent Saturday (and 22 | # starting on the prior Sunday) 23 | # For a month, it is the period starting on the first of the previous month. 24 | if [ "$MODE" = "weekly" ]; then 25 | # The Sunday of the previous complete week 26 | TARGET=$(date -d 'last sunday - 1 week' +%Y%m%d) 27 | else 28 | # The first day of the previous complete month 29 | TARGET=$(date -d '1 month ago' +%Y%m01) 30 | fi 31 | fi 32 | 33 | echo "Running $MODE report for period starting on $TARGET" 34 | 35 | # Make sure we have 'jq' and other prereqs 36 | export DEBIAN_FRONTEND=noninteractive; sudo apt-get --yes --force-yes install jq libpq-dev python-dev 37 | sudo pip install psycopg2 38 | 39 | # Fetch db connection details 40 | META=net-mozaws-prod-us-west-2-pipeline-metadata 41 | # Get metadata: 42 | aws s3 cp s3://$META/sources.json ./ 43 | RC=$? 44 | # Check if the copy succeeded. See: 45 | # http://docs.aws.amazon.com/cli/latest/topic/return-codes.html 46 | if [ "$RC" -ne "0" ]; then 47 | echo "ERROR $RC fetching data sources." 48 | exit 2 49 | fi 50 | META_PREFIX=$(jq -r '.["telemetry-executive-summary-db"]["metadata_prefix"]' < sources.json) 51 | # Get read-only credentials: 52 | aws s3 cp s3://$META/$META_PREFIX/read/credentials.json ./ 53 | RC=$? 54 | if [ "$RC" -ne "0" ]; then 55 | echo "ERROR $RC fetching read credentials." 56 | exit 3 57 | fi 58 | 59 | DB_HOST=$(jq -r '.host' < credentials.json) 60 | DB_PORT=$(jq -r '.port' < credentials.json) 61 | DB_NAME=$(jq -r '.db_name' < credentials.json) 62 | DB_USER=$(jq -r '.username' < credentials.json) 63 | DB_PASS=$(jq -r '.password' < credentials.json) 64 | 65 | # Code expects a URL of the form: 66 | # postgresql://username:password@hostname:port/dbname 67 | DB_URL="postgresql://${DB_USER}:${DB_PASS}@${DB_HOST}:${DB_PORT}/${DB_NAME}" 68 | 69 | CURRENT="$OUTPUT/executive_report.${MODE}.${TARGET}.csv" 70 | time python run_executive_report.py \ 71 | --verbose \ 72 | --check-tables \ 73 | --db-url "$DB_URL" \ 74 | --report-start $TARGET \ 75 | --mode $MODE > "$CURRENT" 76 | 77 | RC=$? 78 | if [ "$RC" -ne "0" ]; then 79 | echo "ERROR $RC running report." 
80 | exit 5 81 | fi 82 | 83 | OVERALL="v4-${MODE}.csv" 84 | DASHBOARD_S3="s3://net-mozaws-prod-metrics-data/firefox-dashboard" 85 | echo "Fetching previous state from $OVERALL..." 86 | aws s3 cp "$DASHBOARD_S3/$OVERALL" ./ 87 | RC=$? 88 | 89 | if [ -s "$OVERALL" ]; then 90 | if [ "$RC" -eq "0" ]; then 91 | echo "Backing up previous state" 92 | # If we have an existing file, back it up. 93 | cp "$OVERALL" "$OUTPUT/${OVERALL}.pre_${TARGET}" 94 | gzip "$OUTPUT/${OVERALL}.pre_${TARGET}" 95 | # TODO: Should we grep -v the TARGET date, replacing instead of potentially 96 | # duplicating? 97 | else 98 | echo "ERROR $RC fetching previous state, aborting." 99 | exit 4 100 | fi 101 | else 102 | echo "No previous state found, starting fresh" 103 | # If we don't have a previous state, add the header line from this run. 104 | head -n 1 "$CURRENT" > "$OVERALL" 105 | fi 106 | 107 | echo "Checking if the csv header is the same. Diffs:" 108 | HEADER_DIFFS=$(diff <(head -n 1 $OVERALL) <(head -n 1 $CURRENT)) 109 | if [ ! -z "$HEADER_DIFFS" ]; then 110 | echo "WARNING: headers were different. <<overall >>current" 111 | echo $HEADER_DIFFS 112 | else 113 | echo "None. Headers match." 114 | fi 115 | 116 | echo "Appending current data to overall state (minus header)" 117 | tail -n +2 "$CURRENT" >> "$OVERALL" 118 | 119 | # Run the cleanup script 120 | python reformat_v4.py --file "$OVERALL" --output "$OVERALL" 121 | 122 | echo "Uploading updated state back to dashboard bucket" 123 | # Upload the state back. 124 | aws s3 cp "$OVERALL" "$DASHBOARD_S3/" --acl bucket-owner-full-control 125 | RC=$? 126 | if [ "$RC" -ne "0" ]; then 127 | echo "ERROR $RC re-uploading to dashboard bucket ($DASHBOARD_S3)." 128 | fi 129 | 130 | # Then stick it in the output dir 131 | mv "$OVERALL" "$OUTPUT/" 132 | 133 | # And finally gzip it. 134 | gzip "$OUTPUT/$OVERALL" 135 | -------------------------------------------------------------------------------- /reports/fennec_dashboard/README.txt: -------------------------------------------------------------------------------- 1 | To Deploy 2 | ========= 3 | Until Bug 1258685 lands, the notebook will automatically select the operating mode ("weekly" or 4 | "monthly") based on the notebook file name. For this reason, two different Spark jobs need 5 | to be scheduled. 6 | 7 | Weekly aggregation 8 | ------------------ 9 | 10 | 1. Log in to Telemetry Self-Serve Data Analysis 11 | 2. Click 'Schedule a Spark Job' 12 | 3. Edit or create a job with the following parameters: 13 | Job Name: telemetry-fennec-dashboard-weekly 14 | Notebook or Jar: summarize_csv_weekly.ipynb 15 | Spark Submission Args: N/A 16 | Cluster Size: 5 17 | Output Visibility: Private 18 | Schedule Frequency: Weekly 19 | Day of Week: N/A (Sunday) 20 | Day of Month: N/A (1) 21 | Time of Day (UTC): 4am 22 | Job Timeout (minutes): 300 23 | 24 | Monthly aggregation 25 | ------------------ 26 | 27 | 1. Log in to Telemetry Self-Serve Data Analysis 28 | 2. Click 'Schedule a Spark Job' 29 | 3.
Edit or create a job with the following parameters: 30 | Job Name: telemetry-fennec-dashboard-monthly 31 | Notebook or Jar: summarize_csv_monthly.ipynb 32 | Spark Submission Args: N/A 33 | Cluster Size: 10 34 | Output Visibility: Private 35 | Schedule Frequency: Monthly 36 | Day of Week: N/A (Sunday) 37 | Day of Month: N/A (1) 38 | Time of Day (UTC): 4am 39 | Job Timeout (minutes): 300 40 | -------------------------------------------------------------------------------- /reports/loop/hindsight.cfg: -------------------------------------------------------------------------------- 1 | output_path = "output" 2 | output_size = 1024 * 1024 * 1024 3 | sandbox_load_path = "" 4 | sandbox_run_path = "run" 5 | analysis_threads = 1 6 | analysis_lua_path = "/usr/lib/luasandbox/modules/?.lua;/mnt/work/heka/share/heka/lua_modules/?.lua" 7 | analysis_lua_cpath = "/usr/lib/luasandbox/modules/?.so;/mnt/work/heka/share/heka/lua_modules/?.so" 8 | io_lua_path = analysis_lua_path .. ";/usr/lib/luasandbox/io_modules/?.lua;/mnt/work/heka/share/heka/lua_io_modules/?.lua" 9 | io_lua_cpath = analysis_lua_cpath .. ";/usr/lib/luasandbox/io_modules/?.so;/mnt/work/heka/share/heka/lua_io_modules/?.so" 10 | max_message_size = 8 * 1024 * 1024 11 | backpressure = 2 12 | 13 | analysis_defaults = { 14 | output_limit = 0, 15 | memory_limit = 0, 16 | instruction_limit = 0, 17 | ticker_interval = 0, 18 | preserve_data = false, 19 | } 20 | 21 | input_defaults = { 22 | output_limit = 8 * 1024 * 1024, 23 | instruction_limit = 0, 24 | preserve_data = false, 25 | } 26 | 27 | output_defaults = { 28 | output_limit = 8 * 1024 * 1024, 29 | ticker_interval = 0, 30 | instruction_limit = 0, 31 | memory_limit = 0, 32 | preserve_data = false, 33 | } 34 | 35 | -------------------------------------------------------------------------------- /reports/loop/run/analysis/hll_check.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Sanity check to make sure the low loop traffic volumes don't throw off the 7 | loop hyperloglog results too much. 
8 | --]] 9 | 10 | require "hyperloglog" 11 | require "math" 12 | 13 | local days = {} 14 | 15 | function process_message() 16 | local day = math.floor(read_message("Timestamp") / 1e9 / 86400) 17 | local d = days[day] 18 | if not d then 19 | d = {hyperloglog.new(), {}} 20 | days[day] = d 21 | end 22 | local p = read_message("Fields[uid]") 23 | d[1]:add(p) 24 | d[2][p] = true 25 | return 0 26 | end 27 | 28 | function timer_event(ns, shutdown) 29 | for k,v in pairs(days) do 30 | local cnt = v[1]:count() 31 | local acnt = 0 32 | for m,n in pairs (v[2]) do 33 | acnt = acnt + 1 34 | end 35 | print(k, "hll", cnt, "actual", acnt, "percentage", cnt/acnt) 36 | end 37 | end 38 | -------------------------------------------------------------------------------- /reports/loop/run/analysis/hll_check.off: -------------------------------------------------------------------------------- 1 | filename = "hll_check.lua" 2 | message_matcher = "Logger == 'mozilla-loop-server' && Type == 'request.summary' && Fields[uid] != NIL && Fields[action] == 'join' && (Fields[userType] == 'Unregistered' || Fields[userType] == 'Registered')" 3 | thread = 0 4 | -------------------------------------------------------------------------------- /reports/loop/run/analysis/retention.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | WARNING THIS PLUGIN EXPECTS THE DATA ORDERED BY DAY WITH NO GAPS. 7 | --]] 8 | 9 | require "cjson" 10 | require "math" 11 | require "os" 12 | require "table" 13 | 14 | local DAY_OFFSET = 4 -- start the week on Sunday and correct for the Unix epoch landing on a Thursday 15 | local SEC_IN_DAY = 60 * 60 * 24 16 | local SEC_IN_WEEK = SEC_IN_DAY * 7 17 | 18 | local COHORT = 1 19 | local DAY = 2 20 | local uids = {} -- each key has an array columns: cohort, day, interval flag 21 | 22 | local interval_days = read_config("interval_days") or error("an interval_days must be configured") 23 | 24 | function process_message() 25 | local day = math.floor(read_message("Timestamp") / 1e9 / SEC_IN_DAY) 26 | local week = math.floor((day + DAY_OFFSET) / 7) 27 | local cohort = week * SEC_IN_WEEK - (SEC_IN_DAY * DAY_OFFSET) 28 | local uid = read_message("Fields[uid]") 29 | 30 | local u = uids[uid] 31 | if not u then 32 | u = {cohort, day} 33 | uids[uid] = u 34 | return 0 35 | end 36 | local delta = day - u[DAY] 37 | if delta <= 0 then return 0 end 38 | 39 | local interval = math.floor((delta - 1) / interval_days) 40 | local cinterval = #u - 2 41 | if interval == cinterval then 42 | u[cinterval + 3] = true 43 | end 44 | return 0 45 | end 46 | 47 | 48 | function timer_event(ns, shutdown) 49 | local cohorts = {} 50 | for k, u in pairs(uids) do 51 | local cohort = u[COHORT] 52 | local c = cohorts[cohort] 53 | if not c then 54 | c = {user_count = 1, intervals = {}} 55 | cohorts[cohort] = c 56 | else 57 | c.user_count = c.user_count + 1 58 | end 59 | for i, j in ipairs(u) do 60 | if i > 2 and j then -- skip the cohort and day entries 61 | local value = c.intervals[i - 2] 62 | if not value then 63 | c.intervals[i - 2] = 1 64 | else 65 | c.intervals[i - 2] = value + 1 66 | end 67 | end 68 | end 69 | end 70 | 71 | local json = {interval_days = interval_days, cohorts = {}} 72 | for k, c in pairs(cohorts) do 73 | json.cohorts[#json.cohorts + 1] = {cohort = os.date("%Y%m%d", 
k), cohort_user_count = c.user_count, interval_counts = c.intervals} 74 | end 75 | table.sort(json.cohorts, function(t1, t2) return t1.cohort < t2.cohort end) 76 | inject_payload("json", "retention", cjson.encode(json)) 77 | end 78 | -------------------------------------------------------------------------------- /reports/loop/run/analysis/retention_daily.cfg: -------------------------------------------------------------------------------- 1 | filename = "retention.lua" 2 | message_matcher = "Logger == 'mozilla-loop-server' && Type == 'request.summary' && Fields[uid] != NIL && Fields[action] == 'join' && (Fields[userType] == 'Unregistered' || Fields[userType] == 'Registered')" 3 | interval_days = 1 4 | thread = 1 5 | -------------------------------------------------------------------------------- /reports/loop/run/analysis/retention_monthly.cfg: -------------------------------------------------------------------------------- 1 | filename = "retention.lua" 2 | message_matcher = "Logger == 'mozilla-loop-server' && Type == 'request.summary' && Fields[uid] != NIL && Fields[action] == 'join' && (Fields[userType] == 'Unregistered' || Fields[userType] == 'Registered')" 3 | interval_days = 28 4 | thread = 2 5 | -------------------------------------------------------------------------------- /reports/loop/run/analysis/retention_weekly.cfg: -------------------------------------------------------------------------------- 1 | filename = "retention.lua" 2 | message_matcher = "Logger == 'mozilla-loop-server' && Type == 'request.summary' && Fields[uid] != NIL && Fields[action] == 'join' && (Fields[userType] == 'Unregistered' || Fields[userType] == 'Registered')" 3 | interval_days = 7 4 | thread = 3 5 | -------------------------------------------------------------------------------- /reports/loop/run/analysis/xau.cfg: -------------------------------------------------------------------------------- 1 | filename = "xau.lua" 2 | message_matcher = "Logger == 'mozilla-loop-server' && Type == 'request.summary' && Fields[uid] != NIL && Fields[action] == 'join' && (Fields[userType] == 'Unregistered' || Fields[userType] == 'Registered')" 3 | thread = 0 4 | -------------------------------------------------------------------------------- /reports/loop/run/analysis/xau.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | WARNING THIS PLUGIN EXPECTS THE DATA ORDERED BY DAY WITH NO GAPS. 
7 | 8 | If it is going to be run for more than this one-off we should make it more robust 9 | --]] 10 | 11 | 12 | require "cjson" 13 | require "hyperloglog" 14 | require "math" 15 | require "os" 16 | require "table" 17 | 18 | local SEC_IN_DAY = 60 * 60 * 24 19 | 20 | local days = {} 21 | local cday = -1 22 | local hll 23 | 24 | local function compute_range(i, len) 25 | if i == 1 then return nil end 26 | local s = i - (len - 1) 27 | if s < 1 then 28 | s = 1 29 | end 30 | return hyperloglog.count(unpack(days, s, i)) 31 | end 32 | 33 | function process_message() 34 | local day = math.floor(read_message("Timestamp") / 1e9 / SEC_IN_DAY) 35 | if cday ~= -1 and (day < cday or day > cday + 1) then 36 | print("day", os.date("%Y%m%d", day * SEC_IN_DAY) , "cday", os.date("%Y%m%d", cday * SEC_IN_DAY)) 37 | error("data is out of order or has gaps") 38 | end 39 | 40 | if day ~= cday then 41 | hll = hyperloglog.new() 42 | days[#days + 1] = hll 43 | cday = day 44 | end 45 | hll:add(read_message("Fields[uid]")) 46 | return 0 47 | end 48 | 49 | function timer_event(ns, shutdown) 50 | local fday = cday - #days 51 | local json = {} 52 | for i, v in ipairs(days) do 53 | local dau = v:count() 54 | local wau = compute_range(i, 7) or dau 55 | local mau = compute_range(i, 28) or dau 56 | json[#json + 1] = {date = os.date("%Y%m%d", (fday + i) * SEC_IN_DAY), dau = dau, wau = wau, mau = mau} 57 | end 58 | table.sort(json, function(t1, t2) return t1.date < t2.date end) 59 | inject_payload("json", "xau", cjson.encode(json)) 60 | end 61 | -------------------------------------------------------------------------------- /reports/loop/run/input/server_logs.cfg: -------------------------------------------------------------------------------- 1 | filename = "server_logs.lua" 2 | start_date = "2015-11-01" 3 | end_date = "2016-05-12" 4 | service = "^loop%-app" 5 | -------------------------------------------------------------------------------- /reports/loop/run/input/server_logs.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Reads the files list application server logs from S3 for reporting.. 7 | 8 | Config: 9 | 10 | filename = "server_logs.lua" 11 | start_date = "2015-11-01" 12 | end_date = "2016-05-11" 13 | service = "^loop%-app" 14 | --]] 15 | 16 | require "heka_stream_reader" 17 | require "io" 18 | require "os" 19 | require "string" 20 | 21 | local date_format = "^(%d%d%d%d)%-(%d%d)%-(%d%d)$" 22 | local service = read_config("service") or "." 
23 | local start_date = read_config("start_date") 24 | local end_date = read_config("end_date") 25 | 26 | local syear, smonth, sday = start_date:match(date_format) 27 | start_date = os.time({year = syear, month = smonth, day = sday}) 28 | 29 | local eyear, emonth, eday = end_date:match(date_format) 30 | end_date = os.time({year = eyear, month = emonth, day = eday}) 31 | 32 | assert(end_date >= start_date, "end_date must be greater than or equal to the start_date") 33 | local num_months = (eyear * 12 + emonth) - (syear * 12 + smonth) 34 | 35 | local function get_file_list(year, month) 36 | local path = string.format("s3://heka-logs/shared/%04d-%02d/", year, month) 37 | local list = {} 38 | 39 | local fh = assert(io.popen(string.format("aws s3 ls %s", path))) 40 | for line in fh:lines() do 41 | local fn, ds = string.match(line, "^%d%d%d%d%-%d%d%-%d%d%s+%d%d:%d%d:%d%d%s+%d+%s+(.-%-(%d%d%d%d%d%d%d%d)_.+)") 42 | if ds then 43 | ds = os.time({year = ds:sub(1, 4), month = ds:sub(5, 6), day = ds:sub(7, 8)}) 44 | if fn and string.match(fn, service) and ds >= start_date and ds <= end_date then 45 | list[#list + 1] = fn 46 | end 47 | end 48 | end 49 | fh:close() 50 | return path, list 51 | end 52 | 53 | 54 | local msg = { 55 | Timestamp = 0, 56 | Type = "", 57 | Logger = "", 58 | Fields = { 59 | action = "", 60 | userType = "", 61 | uid = "" 62 | } 63 | } 64 | 65 | function process_message() 66 | local year = tonumber(syear) 67 | local month = tonumber(smonth) 68 | for i=0, num_months do 69 | local path, list = get_file_list(year, month) 70 | for i,fn in ipairs(list) do 71 | local hsr = heka_stream_reader.new(path) 72 | print("processing", fn) 73 | local fh = assert(io.popen(string.format("aws s3 cp %s%s - | gzip -d -c", path, fn))) 74 | local found, consumed, read 75 | repeat 76 | repeat 77 | found, consumed, read = hsr:find_message(fh) 78 | if found then 79 | -- inject_message(hsr) -- todo remove loop filtering 80 | 81 | -- filtering/data reduction for loop testing 82 | local action = hsr:read_message("Fields[action]") 83 | local userType = hsr:read_message("Fields[userType]") 84 | local uid = hsr:read_message("Fields[uid]") 85 | if uid and action == "join" and (userType == "Unregistered" or userType == "Registered") then 86 | msg.Timestamp = hsr:read_message("Timestamp") 87 | msg.Type = hsr:read_message("Type") 88 | msg.Logger = hsr:read_message("Logger") 89 | msg.Fields.action = action 90 | msg.Fields.userType = userType 91 | msg.Fields.uid = uid 92 | inject_message(msg) 93 | end 94 | -- end loop testing 95 | end 96 | until not found 97 | until read == 0 98 | fh:close() 99 | end 100 | month = month + 1 101 | if month == 13 then 102 | month = 1 103 | year = year + 1 104 | end 105 | end 106 | return 0 107 | end 108 | -------------------------------------------------------------------------------- /reports/loop/run/output/placeholder.off: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla-services/data-pipeline/0c94d328f243338d21bae360547c300ac1b82b12/reports/loop/run/output/placeholder.off -------------------------------------------------------------------------------- /reports/stability-summary/run.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | sudo apt-get -y install postgresql-client python-psycopg2 jq 4 | sudo easy_install boto3 5 | 6 | # Fetch metadata 7 | META=net-mozaws-prod-us-west-2-pipeline-metadata 8 | # Get metadata: 9 | aws s3 cp s3://$META/sources.json ./ 10 
| 11 | JOBNAME=telemetry-executive-summary-db 12 | META_PREFIX=$(jq -r ".[\"${JOBNAME}\"][\"metadata_prefix\"]" < sources.json) 13 | 14 | aws s3 cp s3://$META/$META_PREFIX/write/credentials.json ./ 15 | 16 | DB_HOST=$(jq -r '.["host"]' < credentials.json) 17 | DB_NAME=$(jq -r '.["db_name"]' < credentials.json) 18 | DB_USER=$(jq -r '.["username"]' < credentials.json) 19 | DB_PW=$(jq -r '.["password"]' < credentials.json) 20 | 21 | CONNECTION_STRING="host=$DB_HOST dbname=$DB_NAME user=$DB_USER password=$DB_PW" 22 | 23 | echo "running rollup.py" 24 | python rollup.py -d "$CONNECTION_STRING" 25 | -------------------------------------------------------------------------------- /reports/stability-summary/summarize.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | from utils import S3CompressedReader, S3CompressedWriter, HeaderCSVReader 4 | from collections import defaultdict, Counter 5 | from itertools import izip, count 6 | 7 | default_bucket = 'telemetry-public-analysis-2' 8 | 9 | prop_list = ( 10 | 'abortedsessioncount', 11 | 'subsessionlengths', 12 | 'abortsplugin', 13 | 'abortscontent', 14 | 'abortsgmplugin', 15 | 'crashesdetectedplugin', 16 | 'pluginhangs', 17 | 'crashesdetectedcontent', 18 | 'crashesdetectedgmplugin', 19 | 'crashsubmitattemptmain', 20 | 'crashsubmitattemptcontent', 21 | 'crashsubmitattemptplugin', 22 | 'crashsubmitsuccessmain', 23 | 'crashsubmitsuccesscontent', 24 | 'crashsubmitsuccessplugin') 25 | 26 | class Counts(object): 27 | def __init__(self): 28 | self._counts = [0] * len(prop_list) 29 | self.crashes = 0 30 | 31 | def increment(self, i, v): 32 | self._counts[i] += v 33 | 34 | def final(self, **kwargs): 35 | d = dict(izip(prop_list, self._counts)) 36 | d.update(kwargs) 37 | d['crashesdetectedmain'] = self.crashes 38 | return d 39 | 40 | def nullint(v): 41 | if v == '': 42 | return 0 43 | return int(v) 44 | 45 | def summarize(date): 46 | """ 47 | read the large CSV file produced by rollup.put_counts and 48 | rollup.put_crashes into a smaller summary JSON format for quick overview 49 | graphing. 
50 | """ 51 | 52 | counts = defaultdict(Counts) 53 | 54 | counts_path = 'stability-rollups/{year}/{date}-main.csv.gz'.format( 55 | year=date.year, date=date.strftime('%Y%m%d')) 56 | csvheaders, reader = HeaderCSVReader( 57 | S3CompressedReader(default_bucket, counts_path)) 58 | key_indexes = [csvheaders.index(prop) 59 | for prop in ('channel', 'buildid', 'os')] 60 | csv_indexes = [(csvheaders.index(prop), propidx) 61 | for propidx, prop in izip(count(), prop_list)] 62 | for row in reader: 63 | key = tuple(row[idx] for idx in key_indexes) 64 | counter = counts[key] 65 | for csvidx, propidx in csv_indexes: 66 | counter.increment(propidx, nullint(row[csvidx])) 67 | 68 | crashes_path = 'stability-rollups/{year}/{date}-crashes.csv.gz'.format( 69 | year=date.year, date=date.strftime('%Y%m%d')) 70 | csvheaders, reader = HeaderCSVReader( 71 | S3CompressedReader(default_bucket, crashes_path)) 72 | key_indexes = [csvheaders.index(prop) 73 | for prop in ('channel', 'buildid', 'os')] 74 | for row in reader: 75 | key = tuple(row[idx] for idx in key_indexes) 76 | counts[key].crashes += nullint(row[-1]) 77 | 78 | summary_path = 'stability-rollups/{year}/{date}-summary.json.gz'.format( 79 | year=date.year, date=date.strftime('%Y%m%d')) 80 | with S3CompressedWriter(default_bucket, summary_path) as fd: 81 | json.dump([c.final(channel=channel, buildid=buildid, os=os) 82 | for (channel, buildid, os), c in counts.iteritems()], fd) 83 | 84 | if __name__ == '__main__': 85 | import sys 86 | from datetime import date, timedelta 87 | start = date(2015, 11, 5) 88 | end = date(2015, 11, 30) 89 | for i in count(): 90 | d = start + timedelta(days=i) 91 | if d > end: 92 | break 93 | summarize(d) 94 | -------------------------------------------------------------------------------- /reports/stability-summary/utils.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from gzip import GzipFile 3 | from cStringIO import StringIO 4 | import sys 5 | import csv 6 | 7 | class S3CompressedWriter(object): 8 | def __init__(self, bucket, path, mimetype='text/plain'): 9 | self.bucket = bucket 10 | self.path = path 11 | self.mimetype = mimetype 12 | self._buffer = None 13 | 14 | def __enter__(self): 15 | self._buffer = StringIO(); 16 | self._writer = GzipFile(mode="wb", fileobj=self._buffer) 17 | return self._writer 18 | 19 | def __exit__(self, exc_type, exc_value, traceback): 20 | if exc_value is None: 21 | self._writer.close() 22 | self._buffer.seek(0) 23 | s3 = boto3.resource('s3') 24 | s3.Object(self.bucket, self.path).put(Body=self._buffer, ContentEncoding='gzip', ContentType=self.mimetype) 25 | self._buffer = None 26 | 27 | def __del__(self): 28 | assert self._buffer is None 29 | 30 | def S3CompressedReader(bucket, path): 31 | s3 = boto3.resource('s3') 32 | r = s3.Object(bucket, path).get() 33 | body = StringIO(r['Body'].read()) 34 | return GzipFile(mode="rb", fileobj=body) 35 | 36 | def HeaderCSVReader(fd, *args, **kwargs): 37 | """ 38 | Read CSV data from `fd`, separating the header list from the data. 39 | """ 40 | reader = csv.reader(fd, *args, **kwargs) 41 | header = reader.next() 42 | return header, reader 43 | --------------------------------------------------------------------------------