├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── README.md
├── TODO.md
├── __init__.py
├── analysis
    ├── __init__.py
    ├── analysis-worker-stack.yaml
    ├── downloader.py
    ├── example
    │   ├── processor
    │   └── test
    │   │   ├── input.txt
    │   │   ├── ss-ff-n-22.lzma
    │   │   ├── ss-ff-n-28.lzma
    │   │   └── test-processor.sh
    ├── helpers.py
    ├── launcher.py
    ├── makefile
    ├── manager.py
    ├── test-filter.json
    ├── utils.py
    └── worker.py
├── bin
    └── get_histogram_tools.sh
├── cmake
    ├── FindLZMA.cmake
    ├── FindProtobuf.cmake
    ├── doxygen.cmake
    ├── externals.cmake
    ├── mozsvc.cmake
    └── rapidjson-0_11.patch
├── docs
    ├── BagheeraIntegration.md
    ├── CompressionBenchmarks.md
    ├── Deduplication.md
    ├── MapReduce.md
    ├── PayloadFormat.md
    ├── ProcessIncoming.md
    ├── StorageFormat.md
    ├── StorageLayout.md
    ├── data_flow.png
    ├── data_flow.svg
    └── telemetry_logo.svg
├── http
    ├── __init__.py
    ├── analysis-service
    │   ├── analysis-resources.yaml
    │   ├── analysis-service-stack.yaml
    │   ├── config.py
    │   ├── crontab.py
    │   ├── jobs
    │   │   └── run.sh
    │   ├── makefile
    │   ├── requirements
    │   ├── server.py
    │   ├── static
    │   │   ├── cluster.png
    │   │   ├── schedule.png
    │   │   ├── style.css
    │   │   └── worker.png
    │   ├── templates
    │   │   ├── base.html
    │   │   ├── boot-script.sh
    │   │   ├── cluster
    │   │   │   ├── cluster.html
    │   │   │   ├── email.html
    │   │   │   ├── kill.html
    │   │   │   ├── monitor.html
    │   │   │   ├── schedule.html
    │   │   │   ├── schedule_create.html
    │   │   │   ├── schedule_delete.html
    │   │   │   └── schedule_files.html
    │   │   ├── index.html
    │   │   ├── instance-launched-email.html
    │   │   ├── kill.html
    │   │   ├── macros.html
    │   │   ├── monitor.html
    │   │   ├── schedule.html
    │   │   ├── schedule_create.html
    │   │   ├── schedule_delete.html
    │   │   ├── schedule_files.html
    │   │   └── worker.html
    │   ├── terminate-expired-instances.py
    │   └── user.py
    ├── histogram_server.py
    ├── relay.js
    ├── server.js
    └── server_config.json
├── mapreduce
    ├── __init__.py
    ├── addon_perf
    │   ├── README.md
    │   ├── addon-perf.json
    │   ├── addon-scan.json
    │   ├── addon-versions.py
    │   ├── addon_perf.py
    │   ├── combine.py
    │   ├── filter_template.json
    │   ├── package.sh
    │   ├── processAddonPerf.sh
    │   ├── run-fork.sh
    │   └── run.sh
    ├── chromehangs
    │   ├── chromehangs.py
    │   ├── combine.py
    │   ├── combine_week.sh
    │   ├── extract_common_stacks.py
    │   ├── filter_template.json
    │   ├── package.sh
    │   ├── run.sh
    │   ├── run_public.sh
    │   ├── symbolicate.py
    │   └── test_symbolicate.py
    ├── examples
    │   ├── heka
    │   │   ├── distribution.py
    │   │   ├── filter.json
    │   │   └── run.sh
    │   └── v2
    │   │   ├── dims_only.py
    │   │   ├── distribution.py
    │   │   ├── filter-nightly-buildid.json
    │   │   ├── filter_include_all.json
    │   │   ├── filter_max_buildid.json
    │   │   ├── filter_min_buildid.json
    │   │   ├── filter_min_max_buildid.json
    │   │   ├── filter_saved_session_Fx_prerelease.json
    │   │   ├── osdistribution.py
    │   │   ├── simple_counter.py
    │   │   └── trivial.py
    ├── experiments
    │   ├── experiments.py
    │   ├── filter_template.json
    │   ├── postprocess.py
    │   └── run.sh
    ├── flash
    │   ├── csv_header.txt
    │   ├── filter_template.json
    │   ├── flash_versions.py
    │   ├── package.sh
    │   └── run.sh
    ├── fxosping
    │   ├── csv_header.txt
    │   ├── filter_template.json
    │   ├── fxosping.py
    │   ├── package.sh
    │   └── run.sh
    ├── hekajob.py
    ├── job.py
    ├── loop_failure_summary
    │   ├── failures_by_type.py
    │   ├── filter_template.json
    │   ├── header.txt
    │   ├── run.sh
    │   └── summarize.py
    ├── mainthreadio
    │   ├── csv_header.txt
    │   ├── filter_template.json
    │   ├── mainthreadio.py
    │   ├── package.sh
    │   ├── run.sh
    │   └── summary.py
    └── slowsql
    │   ├── csv_header.txt
    │   ├── filter_template.json
    │   ├── package.sh
    │   ├── run.sh
    │   └── slowsql.py
├── mongodb
    ├── __init__.py
    ├── examples
    │   └── osdistribution.js
    └── importer.py
├── monitoring
    ├── __init__.py
    ├── anomaly_detection
    │   ├── detect.py
    │   └── notify.py
    ├── expire_flash_video
    │   ├── __init__.py
    │   └── expire_flash_video.py
    ├── heka
    │   ├── common.toml
    │   ├── incoming_stats.toml
    │   ├── lua_decoders
    │   │   ├── telemetry_incoming_stats.lua
    │   │   └── telemetry_server.lua
    │   ├── lua_filters
    │   │   ├── telemetry_channel_metrics.lua
    │   │   ├── telemetry_server_metrics.lua
    │   │   ├── telemetry_stats_bytes.lua
    │   │   ├── telemetry_stats_errors.lua
    │   │   └── telemetry_stats_records.lua
    │   └── server.toml
    ├── process_incoming
    │   ├── error_rates.py
    │   ├── sample_data
    │   │   ├── TelemetryStatsErrorsAggregator.bad_payload.cbuf
    │   │   ├── TelemetryStatsErrorsAggregator.conversion_error.cbuf
    │   │   ├── TelemetryStatsErrorsAggregator.corrupted_data.cbuf
    │   │   ├── TelemetryStatsErrorsAggregator.empty_data.cbuf
    │   │   ├── TelemetryStatsErrorsAggregator.invalid_path.cbuf
    │   │   ├── TelemetryStatsErrorsAggregator.missing_revision.cbuf
    │   │   ├── TelemetryStatsErrorsAggregator.missing_revision_repo.cbuf
    │   │   ├── TelemetryStatsErrorsAggregator.uuid_only_path.cbuf
    │   │   ├── TelemetryStatsErrorsAggregator.write_failed.cbuf
    │   │   └── TelemetryStatsRecordsAggregator.ReaderALL.cbuf
    │   └── viz
    │   │   ├── css
    │   │       ├── bootstrap.css
    │   │       ├── metrics-graphics-demo.css
    │   │       └── metrics-graphics.css
    │   │   ├── data
    │   │       └── errors.example.json
    │   │   ├── index.html
    │   │   └── js
    │   │       ├── main.js
    │   │       └── metrics-graphics.js
    ├── sanitize_fxos
    │   ├── __init__.py
    │   └── sanitize_fxos_pings.py
    └── telemetry.mozilla.org
    │   └── check_last_update.py
├── process_incoming
    ├── __init__.py
    ├── process_incoming_mp.py
    ├── process_incoming_serial.py
    ├── process_incoming_standalone.py
    └── worker
    │   ├── CMakeLists.txt
    │   ├── ConvertConfig.cpp
    │   ├── ConvertConfig.h
    │   ├── common
    │       ├── CMakeLists.txt
    │       ├── Common.h
    │       ├── CompressedFileWriter.cpp
    │       ├── CompressedFileWriter.h
    │       ├── HekaLogger.cpp
    │       ├── HekaLogger.h
    │       ├── HistogramCache.cpp
    │       ├── HistogramCache.h
    │       ├── HistogramConverter.cpp
    │       ├── HistogramConverter.h
    │       ├── HistogramSpecification.cpp
    │       ├── HistogramSpecification.h
    │       ├── Logger.h
    │       ├── Metric.cpp
    │       ├── Metric.h
    │       ├── RecordWriter.cpp
    │       ├── RecordWriter.h
    │       ├── TelemetryConstants.h
    │       ├── TelemetryConstants.in.cpp
    │       ├── TelemetryRecord.cpp
    │       ├── TelemetryRecord.h
    │       ├── TelemetrySchema.cpp
    │       ├── TelemetrySchema.h
    │       ├── message.pb.cc
    │       ├── message.pb.h
    │       ├── message.proto
    │       └── test
    │       │   ├── CMakeLists.txt
    │       │   ├── TestConfig.in.h
    │       │   ├── TestHistogramCache.cpp
    │       │   ├── TestHistogramConverter.cpp
    │       │   ├── TestHistogramSpecification.cpp
    │       │   ├── TestRecordWriter.cpp
    │       │   ├── TestTelemetryRecord.cpp
    │       │   ├── TestTelemetrySchema.cpp
    │       │   └── data
    │       │       ├── 8d3810543edc.json.FIREFOX_AURORA_24_BASE
    │       │       ├── a55c55edf302.json
    │       │       ├── a55c55edf302.json.FIREFOX_AURORA_23_BASE
    │       │       ├── ad0ae007aa9e.json.FIREFOX_AURORA_25_BASE
    │       │       ├── cache
    │       │           ├── 8d3810543edc.json
    │       │           ├── a55c55edf302.json
    │       │           └── ad0ae007aa9e.json
    │       │       ├── invalid.json
    │       │       ├── invalid_kind.json
    │       │       ├── invalid_schema.json
    │       │       ├── missing_kind.json
    │       │       ├── telemetry1.log
    │       │       └── telemetry_schema.json
    │   ├── convert.cpp
    │   └── convert.json
├── provisioning
    ├── __init__.py
    ├── ansible
    │   ├── README.md
    │   ├── envs
    │   │   ├── dev.yml
    │   │   └── dev_secrets.example.yml
    │   ├── hosts
    │   ├── playbooks
    │   │   ├── app.yml
    │   │   ├── build_ami.yml
    │   │   ├── make_code_package.yml
    │   │   ├── resources.yml
    │   │   └── route53.yml
    │   └── templates
    │   │   └── route53.json
    ├── aws
    │   ├── __init__.py
    │   ├── aws_incoming.example.json
    │   ├── aws_incoming.prod.json
    │   ├── aws_launcher.py
    │   ├── aws_telemetry_server_config.example.json
    │   ├── aws_telemetry_server_config.prod.json
    │   ├── aws_telemetry_server_config.prod_secondary.json
    │   ├── aws_util.py
    │   ├── create_ami.py
    │   ├── create_telemetry_base_ami.py
    │   ├── create_telemetry_worker_ami.py
    │   ├── launch_mapreduce_job.py
    │   ├── launch_telemetry_server.py
    │   ├── launch_worker.py
    │   ├── process_incoming_distributed.py
    │   ├── process_incoming_queue.py
    │   ├── telemetry_server_base.hvm.json
    │   ├── telemetry_server_base.pv.json
    │   └── telemetry_worker.hvm.json
    ├── cloudformation
    │   ├── telemetry-regression-alerts.json
    │   └── telemetry-server-stack.json
    └── config
    │   ├── boto.cfg
    │   └── telemetry_aws.prod.json
├── server
    └── server_config.spot.json
├── telemetry
    ├── __init__.py
    ├── convert.py
    ├── infoFieldsMap.py
    ├── persist.py
    ├── revision_cache.py
    ├── telemetry_schema.json
    ├── telemetry_schema.py
    ├── test_convert.py
    ├── test_persist.py
    ├── test_revision_cache.py
    ├── test_telemetry_schema.py
    └── util
    │   ├── __init__.py
    │   ├── bench.py
    │   ├── benchmark_server.py
    │   ├── bucket_list.py
    │   ├── cf-yaml-helper.py
    │   ├── compress.py
    │   ├── convert_local_pings.py
    │   ├── convert_log_v0_to_v1.py
    │   ├── export.py
    │   ├── files.py
    │   ├── heka_message.py
    │   ├── heka_message_parser.py
    │   ├── lists.py
    │   ├── message_pb2.py
    │   ├── pack_log.py
    │   ├── s3.py
    │   ├── split_raw_log.py
    │   ├── test_compress.py
    │   ├── test_downloader.py
    │   ├── timer.py
    │   └── unpack_log.py
└── test
    ├── test.txt.gz
    ├── test.txt.lzma
    ├── test.txt.xz
    └── unicode.v1.packed


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.pyc
 2 | *.gz
 3 | *.bz2
 4 | *.xz
 5 | *.lzma
 6 | *.swp
 7 | *.swo
 8 | *.out
 9 | *.o
10 | *~
11 | .idea/
12 | histogram_cache/
13 | http/analysis-service/analysis-resources.json
14 | http/analysis-service/analysis-service-stack.json
15 | http/analysis-service/telemetry-analysis-service.tar.gz
16 | node_modules/
17 | htmlcov/
18 | histogram_tools.py
19 | .DS_Store
20 | CMakeFiles
21 | provisioning/ansible/envs/dev_secrets.yml
22 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # This Source Code Form is subject to the terms of the Mozilla Public
 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 4 | 
 5 | cmake_minimum_required(VERSION 2.8 FATAL_ERROR)
 6 | project(telemetry)
 7 | set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "telemetry-server")
 8 | set(CPACK_PACKAGE_VERSION_MAJOR 0)
 9 | set(CPACK_PACKAGE_VERSION_MINOR 1)
10 | set(CPACK_PACKAGE_VERSION_PATCH 0)
11 | 
12 | set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
13 | include(mozsvc)
14 | include(externals)
15 | 
16 | find_package (Threads)
17 | find_package(ZLIB REQUIRED)
18 | find_package(LZMA REQUIRED)
19 | find_package(OpenSSL REQUIRED)
20 | find_package(Protobuf 2.3 REQUIRED)
21 | find_package(Boost 1.54.0 REQUIRED log filesystem system thread unit_test_framework regex)
22 | 
23 | include_directories(${Boost_INCLUDE_DIRS} ${OPENSSL_INCLUDE_DIR} "${CMAKE_SOURCE_DIR}/process_incoming/worker/common")
24 | 
25 | add_subdirectory(process_incoming/worker)
26 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | http://www.mozilla.org/MPL/2.0/index.txt
2 | 


--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
 1 | TODO
 2 | ====
 3 | 
 4 | - [P2] Add "number of records" to exported filenames
 5 | - [P2] Figure out idle-daily de-duplication
 6 | - [P2] Supply the correct Histograms.json spec for each record to the Mapper
 7 | - [P2] MapReduce: delete downloaded data files after they have been processed.
 8 | - [P2] Improve speed of the conversion process
 9 | - [P3] Have the "process_incoming" job write bad input records back to S3
10 | - [P3] Stream data from S3 for MapReduce instead of downloading first
11 | - [P3] Add timeout/retry around fetching Histograms.json from hg.mozilla.org
12 | - [P3] Add many tests
13 | - [P3] Add runtime performance metrics
14 | - [P3] Ensure things are in order to accept Addon Histograms, such as
15 |        from [pdf.js][5]
16 | - [P4] Change the RevisionCache to fetch the entire history of Histograms.json
17 |        and then convert incoming revisions to times to find the right version
18 | 
19 | [1]: https://github.com/Cue/scales "Scales"
20 | [2]: http://docs.python.org/2/library/logging.html "Python Logging"
21 | [3]: http://docs.python.org/2/library/profile.html "Python Profilers"
22 | [5]: https://github.com/mozilla/pdf.js/pull/3532/files#L1R29
23 | [7]: http://docs.aws.amazon.com/AmazonS3/latest/dev/object-lifecycle-mgmt.html
24 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/__init__.py


--------------------------------------------------------------------------------
/analysis/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/analysis/__init__.py


--------------------------------------------------------------------------------
/analysis/downloader.py:
--------------------------------------------------------------------------------
 1 | from multiprocessing import Process
 2 | from boto.s3.connection import S3Connection
 3 | from traceback import print_exc
 4 | from utils import mkdirp
 5 | import os, sys
 6 | 
 7 | class DownloaderProcess(Process):
 8 |     """ Worker process that download files from queue to folder """
 9 |     def __init__(self, input_queue, output_queue,
10 |                        work_folder, aws_cred):
11 |         super(DownloaderProcess, self).__init__()
12 |         self.input_queue = input_queue
13 |         self.output_queue = output_queue
14 |         self.work_folder = work_folder
15 |         mkdirp(self.work_folder)
16 |         self.input_bucket = "telemetry-published-v2"
17 |         self.aws_cred = aws_cred
18 |         self.s3 = S3Connection(**self.aws_cred)
19 |         self.bucket = self.s3.get_bucket(self.input_bucket, validate = False)
20 | 
21 |     def run(self):
22 |         while True:
23 |             prefix = self.input_queue.get()
24 |             self.download(prefix)
25 | 
26 |     def download(self, prefix):
27 |         # Get filename from prefix
28 |         filename = os.path.basename(prefix)
29 |         # Get target path
30 |         target = os.path.join(self.work_folder, filename)
31 |         # Download file
32 |         retries = 1
33 |         success = False
34 |         while retries < 3:
35 |             try:
36 |                 k = self.bucket.get_key(prefix)
37 |                 k.get_contents_to_filename(target)
38 |                 success = True
39 |                 break
40 |             except:
41 |                 retries += 1
42 |                 print >> sys.stderr, "Error on %i'th try:" % retries
43 |                 print_exc(file = sys.stderr)
44 | 
45 |         if success:
46 |             # Put file to output query
47 |             self.output_queue.put((prefix, target))
48 |         else:
49 |             print >> sys.stderr, "Failed to download: %s" % prefix
50 |             self.output_queue.put((prefix, None))
51 | 


--------------------------------------------------------------------------------
/analysis/example/processor:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Number of rows counted
 4 | rows=0;
 5 | 
 6 | # Read stdin line by line
 7 | while read -r line; do
 8 | 
 9 |   # Skip empty-lines, last line may be empty
10 |   if [ "$line" == "" ]; then
11 |     continue;
12 |   fi;
13 | 
14 |   # Split input
15 |   prefix=`echo "$line" | cut -f 1`;
16 |   path=`echo "$line" | cut -f 2`;
17 | 
18 |   # Count number of rows
19 |   new_rows=`xz -dc $path | wc -l`;
20 |   rows=$(($rows + $new_rows));
21 | 
22 |   # Delete input file
23 |   rm $path;
24 | done;
25 | 
26 | # Output aggregated values to files in folder provided as first argument
27 | echo "$rows" > $1/rows_counted.txt;
28 | 


--------------------------------------------------------------------------------
/analysis/example/test/input.txt:
--------------------------------------------------------------------------------
1 | saved_session/Firefox/nightly/22.0a1/20130226031002.20131011.v2.log.e28a4032eb744f089a1828ac7399e5d8.lzma	input/ss-ff-n-22.lzma
2 | saved_session/Firefox/nightly/28.0a1/20131029030201.20131029.v2.log.0ab8723b6fb3455bb34b04d97482fda2.lzma	input/ss-ff-n-28.lzma
3 | 


--------------------------------------------------------------------------------
/analysis/example/test/ss-ff-n-22.lzma:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/analysis/example/test/ss-ff-n-22.lzma


--------------------------------------------------------------------------------
/analysis/example/test/ss-ff-n-28.lzma:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/analysis/example/test/ss-ff-n-28.lzma


--------------------------------------------------------------------------------
/analysis/example/test/test-processor.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 3 | 
 4 | # This tests a processor, and takes a processor job_bundle as input argument
 5 | # that is a tarball containing a script called `processor` which is to be given
 6 | # files as input and output results into a single file
 7 | 
 8 | echo "### Setting up test environment";
 9 | 
10 | # Create test-folders
11 | mkdir -p test-folder/input;
12 | mkdir -p test-folder/output;
13 | 
14 | # Copy in job_bundle
15 | cp $1 test-folder/job_bundle.tar.gz
16 | 
17 | # Copy in test files
18 | cp $DIR/ss-ff-n-22.lzma test-folder/input/ss-ff-n-22.lzma
19 | cp $DIR/ss-ff-n-28.lzma test-folder/input/ss-ff-n-28.lzma
20 | 
21 | # Extract job_bundle
22 | cd test-folder;
23 | tar -xzf job_bundle.tar.gz;
24 | 
25 | # Run tests
26 | echo "### Running processor";
27 | cat $DIR//input.txt | ./processor output/;
28 | 
29 | echo "### Files produced";
30 | find output/;
31 | 
32 | if [ `ls input/ | wc -l` -ne "0" ]; then
33 |   echo "### WARNING";
34 |   echo "Input files where not deleted, please do this as they are consumed.";
35 | fi; 
36 | 


--------------------------------------------------------------------------------
/analysis/helpers.py:
--------------------------------------------------------------------------------
 1 | try:
 2 |     import simplejson as json
 3 | except ImportError:
 4 |     import json
 5 | from subprocess import Popen, PIPE
 6 | from traceback import print_exc
 7 | import sys
 8 | 
 9 | def decompress_input(process):
10 |     def wrapper(self, prefix, path):
11 |         # Find dimensions
12 |         dims = prefix.split('/')
13 |         dims += dims.pop().split('.')[:2]
14 | 
15 |         # Open a compressor
16 |         raw_handle = open(path, "rb")
17 |         decompressor = Popen(
18 |             ['xz', '-d', '-c'],
19 |             bufsize = 65536,
20 |             stdin = raw_handle,
21 |             stdout = PIPE,
22 |             stderr = sys.stderr
23 |         )
24 | 
25 |         # Process each line
26 |         line_nb = 0
27 |         errors = 0
28 |         for line in decompressor.stdout:
29 |             line_nb += 1
30 |             try:
31 |                 uid, payload = line.split("\t", 1)
32 |                 process(self, uid, dims, payload)
33 |             except:
34 |                 print >> sys.stderr, ("Bad input line: %i of %s" %
35 |                                       (line_nb, prefix))
36 |                 print_exc(file = sys.stderr)
37 |                 errors += 1
38 | 
39 |         # Close decompressor
40 |         decompressor.stdout.close()
41 |         raw_handle.close()
42 | 
43 |         # Return number of failed records
44 |         return errors
45 |     return wrapper
46 | 
47 | def parse_input(process):
48 |     def wrapper(self, uid, dimensions, payload):
49 |         process(self, uid, dimensions, json.loads(payload))
50 |     return decompress_input(wrapper)
51 | 
52 | class Processor:
53 |     def __init__(self, output_folder):
54 |         self.output_folder = output_folder
55 | 
56 |     def process(self, prefix, path):
57 |                  # Raise exception on critical crash error
58 |                  # Print errors to stderr
59 |         return 0 # number of errors (rows we had problems parsing)
60 | 
61 |     @decompress_input
62 |     def process(self, uid, dimensions, payload):
63 |         pass    # Raise exception on error
64 | 
65 |     @parse_input
66 |     def process(self, uid, dimensions, json):
67 |         pass    # Raise exception on error
68 | 
69 |     def flush(self):
70 |         pass
71 | 


--------------------------------------------------------------------------------
/analysis/makefile:
--------------------------------------------------------------------------------
 1 | CFYAML = ../telemetry/util/cf-yaml-helper.py
 2 | SOURCES_BUCKET = jonasfj-telemetry-code
 3 | VERSION = 1
 4 | 
 5 | analysis-worker-stack.json: analysis-worker-stack.yaml
 6 | 	$(CFYAML) $< > $@
 7 | 
 8 | put: analysis-worker-stack.json
 9 | 	aws s3 cp analysis-worker-stack.json s3://$(SOURCES_BUCKET)/v$(VERSION)/analysis-worker-stack.json
10 | 
11 | clean:
12 | 	rm -f analysis-worker-stack.json
13 | 
14 | .PHONY: put
15 | 


--------------------------------------------------------------------------------
/analysis/test-filter.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 1,
 3 |   "dimensions": [
 4 |     {
 5 |       "field_name": "reason",
 6 |       "allowed_values": ["saved_session"]
 7 |     },
 8 |     {
 9 |       "field_name": "appName",
10 |       "allowed_values": "*"
11 |     },
12 |     {
13 |       "field_name": "appUpdateChannel",
14 |       "allowed_values": ["release", "aurora", "nightly", "beta", "nightly-ux"]
15 |     },
16 |     {
17 |       "field_name": "appVersion",
18 |       "allowed_values": "*"
19 |     },
20 |     {
21 |       "field_name": "appBuildID",
22 |      "allowed_values": "*"
23 |     },
24 |     {
25 |       "field_name": "submission_date",
26 |       "allowed_values": ["20131013"]
27 |     }
28 |   ]
29 | }
30 | 
31 | 


--------------------------------------------------------------------------------
/analysis/utils.py:
--------------------------------------------------------------------------------
 1 | from errno import EEXIST
 2 | from multiprocessing import active_children, current_process
 3 | import os
 4 | 
 5 | def mkdirp(path):
 6 |     try:
 7 |         os.makedirs(path)
 8 |     except OSError as e:
 9 |         if e.errno != EEXIST or not os.path.isdir(path):
10 |             raise
11 | 
12 | 
13 | 


--------------------------------------------------------------------------------
/bin/get_histogram_tools.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | wget -c https://hg.mozilla.org/mozilla-central/raw-file/6dc53d54f027/toolkit/components/telemetry/histogram_tools.py -O histogram_tools.py
4 | 


--------------------------------------------------------------------------------
/cmake/FindLZMA.cmake:
--------------------------------------------------------------------------------
 1 | # This Source Code Form is subject to the terms of the Mozilla Public
 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 4 | 
 5 | # The module defines the following variables
 6 | #   LZMA_INCLUDE_DIR
 7 | #   LZMA_LIBRARIES
 8 | #   LZMA_FOUND
 9 | 
10 | IF (LZMA_INCLUDE_DIR)
11 |   SET(LZMA_FIND_QUIETLY TRUE)
12 | ENDIF (LZMA_INCLUDE_DIR)
13 | 
14 | FIND_PATH(LZMA_INCLUDE_DIR lzma.h)
15 | FIND_LIBRARY(LZMA_LIBRARY NAMES lzma )
16 | 
17 | INCLUDE(FindPackageHandleStandardArgs)
18 | FIND_PACKAGE_HANDLE_STANDARD_ARGS(LZMA DEFAULT_MSG LZMA_LIBRARY LZMA_INCLUDE_DIR)
19 | 
20 | IF(LZMA_FOUND)
21 |   SET( LZMA_LIBRARIES ${LZMA_LIBRARY} )
22 | ELSE(LZMA_FOUND)
23 |   SET( LZMA_LIBRARIES )
24 | ENDIF(LZMA_FOUND)
25 | 
26 | MARK_AS_ADVANCED(LZMA_LIBRARY LZMA_INCLUDE_DIR)
27 | 


--------------------------------------------------------------------------------
/cmake/FindProtobuf.cmake:
--------------------------------------------------------------------------------
 1 | # This Source Code Form is subject to the terms of the Mozilla Public
 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 4 | 
 5 | # The module defines the following variables:
 6 | #   PROTOBUF_FOUND - true if the Protobuf was found
 7 | #   PROTOBUF_EXECUTABLE - path to the executable
 8 | #   PROTOBUF_VERSION - Protobuf version number
 9 | #   PROTOBUF_LIBRARIES
10 | #   PROTOBUF_INCLUDE_DIR
11 | # Example usage:
12 | #   find_package(Protobuf 2.3 REQUIRED)
13 | 
14 | 
15 | find_program(PROTOBUF_EXECUTABLE protoc PATH_SUFFIXES bin)
16 | if (PROTOBUF_EXECUTABLE)
17 |     execute_process(COMMAND ${PROTOBUF_EXECUTABLE} --version OUTPUT_VARIABLE PROTOBUF_VERSION_OUTPUT OUTPUT_STRIP_TRAILING_WHITESPACE)
18 |     if(PROTOBUF_VERSION_OUTPUT MATCHES "libprotoc ([0-9]+\\.[0-9]+\\.[0-9]+)")
19 |         set(PROTOBUF_VERSION ${CMAKE_MATCH_1})
20 |     endif()
21 | endif()
22 | mark_as_advanced(PROTOBUF_EXECUTABLE)
23 | 
24 | find_path(PROTOBUF_INCLUDE_DIR NAMES "google/protobuf/stubs/common.h" )
25 | find_library(PROTOBUF_LIBRARIES NAMES protobuf)
26 | include(FindPackageHandleStandardArgs)
27 | find_package_handle_standard_args(Protobuf
28 | REQUIRED_VARS PROTOBUF_EXECUTABLE PROTOBUF_VERSION PROTOBUF_LIBRARIES PROTOBUF_INCLUDE_DIR
29 | VERSION_VAR PROTOBUF_VERSION)
30 | 


--------------------------------------------------------------------------------
/cmake/doxygen.cmake:
--------------------------------------------------------------------------------
 1 | # This Source Code Form is subject to the terms of the Mozilla Public
 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 4 | 
 5 | find_package(Doxygen QUIET)
 6 | if(DOXYGEN_FOUND)
 7 |     set(DOXYCONF_IN  ${CMAKE_SOURCE_DIR}/doxygen.in.conf)
 8 |     set(DOXYCONF_OUT ${CMAKE_BINARY_DIR}/doxygen.conf)
 9 |     if(EXISTS ${DOXYCONF_IN})
10 |         configure_file(${DOXYCONF_IN} ${DOXYCONF_OUT})
11 |     else()
12 |         file(WRITE ${DOXYCONF_OUT} "
13 | PROJECT_NAME        = \"${PROJECT_NAME}\"
14 | PROJECT_BRIEF       = \"${CPACK_PACKAGE_DESCRIPTION_SUMMARY}\"
15 | OUTPUT_DIRECTORY    = docs
16 | GENERATE_LATEX      = NO
17 | GENERATE_TODOLIST   = YES
18 | FULL_PATH_NAMES     = YES
19 | STRIP_FROM_PATH     = \"${CMAKE_SOURCE_DIR}\"
20 | SOURCE_BROWSER      = YES
21 | TAB_SIZE            = 4
22 | EXTRACT_ALL         = YES
23 | JAVADOC_AUTOBRIEF   = YES
24 | RECURSIVE           = YES
25 | INPUT               = \"${CMAKE_SOURCE_DIR}\"
26 | EXCLUDE_PATTERNS    = \"${CMAKE_SOURCE_DIR}/.*\" \"${CMAKE_SOURCE_DIR}/debug*\" \"${CMAKE_SOURCE_DIR}/release*\"
27 | EXAMPLE_PATH        = ${EXAMPLE_PATHS}
28 | IMAGE_PATH          = ${IMAGE_PATHS}
29 | BUILTIN_STL_SUPPORT = YES
30 | STRIP_CODE_COMMENTS = NO
31 | SHOW_DIRECTORIES    = YES
32 | PROJECT_NUMBER      = ${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}")
33 |     endif()
34 | 
35 |     add_custom_target(docs ${DOXYGEN_EXECUTABLE} ${DOXYCONF_OUT})
36 | else()
37 |     message("Doxygen was not found, the documentation pages will not be generated")
38 | endif()
39 | 


--------------------------------------------------------------------------------
/cmake/externals.cmake:
--------------------------------------------------------------------------------
 1 | # This Source Code Form is subject to the terms of the Mozilla Public
 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 4 | 
 5 | include(ExternalProject)
 6 | set_property(DIRECTORY PROPERTY EP_BASE "${CMAKE_BINARY_DIR}/ep_base")
 7 | find_program(PATCH_EXECUTABLE patch)
 8 | if (NOT PATCH_EXECUTABLE)
 9 |    message(FATAL_ERROR "patch not found")
10 | endif()
11 | 
12 | externalproject_add(
13 |     rapidjson-0_11
14 |     URL http://rapidjson.googlecode.com/files/rapidjson-0.11.zip
15 |     URL_MD5 96a4b1b57ece8bc6a807ceb14ccaaf94
16 |     PATCH_COMMAND ${PATCH_EXECUTABLE} -p1 < ${CMAKE_CURRENT_LIST_DIR}/rapidjson-0_11.patch
17 |     CONFIGURE_COMMAND ""
18 |     BUILD_COMMAND ""
19 |     INSTALL_COMMAND ""
20 | )
21 | 
22 | set(RAPIDJSON_INCLUDE_DIRS "${CMAKE_BINARY_DIR}/ep_base/Source/rapidjson-0_11/include")
23 | include_directories(${RAPIDJSON_INCLUDE_DIRS})
24 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -isystem ${RAPIDJSON_INCLUDE_DIRS}")
25 | 
26 | 


--------------------------------------------------------------------------------
/cmake/mozsvc.cmake:
--------------------------------------------------------------------------------
 1 | # This Source Code Form is subject to the terms of the Mozilla Public
 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 4 | 
 5 | if(MSVC)
 6 |     # Predefined Macros: http://msdn.microsoft.com/en-us/library/b0084kay.aspx
 7 |     # Compiler options: http://msdn.microsoft.com/en-us/library/fwkeyyhe.aspx
 8 | 
 9 |     # set a high warning level and treat them as errors
10 |     set(CMAKE_C_FLAGS           "/W3 /WX")
11 | 
12 |     # enable C++ exception handling
13 |     set(CMAKE_CXX_FLAGS         "${CMAKE_C_FLAGS} /EHsc")
14 | 
15 |     # debug multi threaded dll runtime, complete debugging info, runtime error checking
16 |     set(CMAKE_C_FLAGS_DEBUG     "/MDd /Zi /RTC1")
17 |     set(CMAKE_CXX_FLAGS_DEBUG   ${CMAKE_C_FLAGS_DEBUG})
18 | 
19 |     # multi threaded dll runtime, optimize for speed, auto inlining
20 |     set(CMAKE_C_FLAGS_RELEASE   "/MD /O2 /Ob2 /DNDEBUG")
21 |     set(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
22 | 
23 |     set(CPACK_GENERATOR         "NSIS")
24 | else()
25 |     # Predefined Macros: clang|gcc -dM -E -x c /dev/null
26 |     # Compiler options: http://gcc.gnu.org/onlinedocs/gcc/Invoking-GCC.html#Invoking-GCC
27 |     set(CMAKE_C_FLAGS   "-std=c11 -pedantic -Werror -Wno-error=deprecated -Wall -Wextra -fPIC")
28 |     set(CMAKE_CXX_FLAGS "-std=c++11 -pedantic -Werror -Wno-error=deprecated -Wall -Wextra -fPIC -isystem /usr/local/include -isystem /opt/local/include")
29 |     set(CMAKE_C_FLAGS_DEBUG     "-g")
30 |     set(CMAKE_CXX_FLAGS_DEBUG   ${CMAKE_C_FLAGS_DEBUG})
31 | 
32 |     set(CMAKE_C_FLAGS_RELEASE   "-O2 -DNDEBUG")
33 |     set(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
34 | 
35 |     set(CMAKE_C_FLAGS_PROFILE   "${CMAKE_C_FLAGS_RELEASE} -g -pg")
36 |     set(CMAKE_CXX_FLAGS_PROFILE ${CMAKE_C_FLAGS_PROFILE})
37 | 
38 |     set(CPACK_GENERATOR         "TGZ")
39 | 
40 |     set(CMAKE_SKIP_BUILD_RPATH              FALSE)
41 |     set(CMAKE_BUILD_WITH_INSTALL_RPATH      FALSE)
42 |     set(CMAKE_INSTALL_RPATH_USE_LINK_PATH   FALSE)
43 | endif()
44 | 
45 | set(CPACK_PACKAGE_VENDOR        "Mozilla Services")
46 | set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_SOURCE_DIR}/LICENSE")
47 | include(CPack)
48 | include(CTest)
49 | 
50 | set(Boost_USE_STATIC_LIBS    ON)
51 | set(Boost_USE_MULTITHREADED  ON)
52 | set(Boost_USE_STATIC_RUNTIME OFF)
53 | 
54 | include(doxygen)
55 | 


--------------------------------------------------------------------------------
/docs/BagheeraIntegration.md:
--------------------------------------------------------------------------------
 1 | Bagheera Integration
 2 | ====================
 3 | 
 4 | Production Telemetry Data is submitted to a [Bagheera][1] server.
 5 | 
 6 | Bagheera is a REST service that accepts submissions via HTTP.
 7 | 
 8 | As of version 0.11, submissions are first saved to a Kafka queue, which is then
 9 | processed by one or more Kafka consumers.
10 | 
11 | As a preliminary way to integrate the prototype telemetry server with the
12 | existing pipeline, we use the [KafkaReplayConsumer][2] to replay the
13 | production requests against the prototype server.
14 | 
15 | This results in no data loss of production data, and an optionally-sampled
16 | stream of data being directed to a second server.
17 | 
18 | The simple approach ro running such a replay consumer would be to use the
19 | packaged `consumer` script distributed with [Bagheera][1] with a command like:
20 | 
21 | ```bash
22 | # Set variables
23 | export BAGHEERA_HOME=/path/to/bagheera
24 | export KAFKA_TOPIC=my_topic
25 | export KAFKA_GID=replay_${KAFKA_TOPIC}_20130624 # ensure the gid is unique to this consumer!
26 | export REPLAY_HOST=www.example.com
27 | export SAMPLE_RATE=0.01 # use '1' to replay all requests, or a float less than one to sample.
28 | 
29 | # Run the command
30 | sudo -u bagheera_user $BAGHEERA_HOME/bin/consumer com.mozilla.bagheera.consumer.KafkaReplayConsumer \
31 |  -t $KAFKA_TOPIC \
32 |  -gid $KAFKA_GID \
33 |  -p $BAGHEERA_HOME/conf/kafka.consumer.properties \
34 |  --copy-keys true \
35 |  --dest "http://$REPLAY_HOST/submit/telemetry/%k" \
36 |  --sample $SAMPLE_RATE \
37 |  --delete false
38 | ```
39 | 
40 | In the case where your network security does not allow outbound HTTP requests,
41 | you may need to specify an HTTP proxy.  This can easily be done at the JVM
42 | level, so we can invoke the full command manually.  We end up with:
43 | 
44 | ```bash
45 | export PROXY_HOST=example.proxy.mozilla.com
46 | export PROXY_PORT=9999
47 | sudo -u bagheera_user java \
48 |  -Dhttp.proxyHost=$PROXY_HOST \
49 |  -Dhttp.proxyPort=$PROXY_PORT \
50 |  ...<snip long list of JVM args copied from the 'consumer' script>... \
51 |  -cp <snip long classpath> \
52 |  com.mozilla.bagheera.consumer.KafkaReplayConsumer \
53 |  -t $KAFKA_TOPIC \
54 |  -gid $KAFKA_GID \
55 |  -p $BAGHEERA_HOME/conf/kafka.consumer.properties \
56 |  --copy-keys true \
57 |  --dest "http://$REPLAY_HOST/submit/telemetry/%k" \
58 |  --sample $SAMPLE_RATE \
59 |  --delete false
60 | ```
61 | 
62 | [1]: https://github.com/mozilla-metrics/bagheera "Bagheera"
63 | [2]: https://github.com/mozilla-metrics/bagheera/blob/master/src/main/java/com/mozilla/bagheera/consumer/KafkaReplayConsumer.java "KafkaReplayConsumer"
64 | 


--------------------------------------------------------------------------------
/docs/CompressionBenchmarks.md:
--------------------------------------------------------------------------------
 1 | Compression Benchmarks
 2 | ======================
 3 | 
 4 | One of the important considerations here is what compression format
 5 | to use, and within that format, what level of compression to use.
 6 | 
 7 | After a round of testing that is largely lost to the sands of time,
 8 | we settled on the LZMA / XZ format.
 9 | 
10 | Some statistics on the time vs. space characteristics of various
11 | compression levels can be found at [compression notes][1].
12 | 
13 | To run a real-world test, you can use some code like the following:
14 | 
15 |     aws s3 cp s3://telemetry-published-bucket/path/to/sample_file.lzma ./
16 |     lzma -d sample_file.lzma
17 |     for level in $(seq 0 9); do
18 |       echo "compressing with level $level"
19 |       time cat sample_file | lzma -${level} > test$level.lzma
20 |       ls -l test$level.lzma
21 |     done &>> comptest.log
22 | 
23 | Using a ~500MB raw input file on a `c3.large` EC2 node, this gives
24 | a result like:
25 | 
26 | 
27 |      Level     Time       Size      Filename
28 |     -------  ---------  ---------  ----------
29 |     level 0  0m26.176s  105830359  test0.lzma
30 |     level 1  0m28.231s   89387336  test1.lzma
31 |     level 2  0m37.868s   81364589  test2.lzma
32 |     level 3  0m52.852s   76801476  test3.lzma
33 |     level 4  1m40.807s   73784033  test4.lzma
34 |     level 5  2m36.868s   65191241  test5.lzma
35 |     level 6  3m39.400s   61367748  test6.lzma
36 |     level 7   4m1.284s   60218864  test7.lzma
37 |     level 8  4m19.748s   59183316  test8.lzma
38 |     level 9  4m47.116s   58338421  test9.lzma
39 | 
40 | 
41 | Using `xz` instead of `lzma` gives nearly identical numbers, but `xz` is to be
42 | preferred since those files can be concatenated without having to decompress
43 | and compress again.
44 | 
45 | [1]: https://docs.google.com/spreadsheet/pub?key=0AoRU282jPz57dFBuX0pZX25NNVRlU3lQTDZUVzlEUEE&output=html
46 | 


--------------------------------------------------------------------------------
/docs/ProcessIncoming.md:
--------------------------------------------------------------------------------
 1 | Architecture for Processing Incoming Data
 2 | =========================================
 3 | 
 4 | Let `N` denote the number of CPU cores available on the processing machine.
 5 | 
 6 | The `Server` Process:
 7 | ---------------------
 8 | *  Create following folders on same storage device
 9 |     * `downloading/`  for files while being downloaded,
10 |     * `upload/`       for files ready for upload
11 |     * `uploading/`    for files being uploaded
12 |     * `incoming/`     for incoming files that have been downloaded
13 | *  Start `N / 2` instances of the `Download` process
14 | *  Start `N / 2` instances of the `Upload` process
15 | *  For `i = 0` to `N` do:
16 |     * Create folders `work-i/`, `input-i/`, `log-i/`
17 |     * Start `Worker` process (given it a reference to `i`)
18 | 
19 | The `Download` Process:
20 | -----------------------
21 | * While number of files in the `incoming` folder is less than `N`.
22 |     * Download a new raw telemetry log file to `downloading/`
23 |     * Move downloaded file from `downloading/` to `incoming/`
24 | 
25 | The `Upload` Process:
26 | ---------------------
27 | * While the `upload/` contains files:
28 |     * Move a file from `upload/` to `uploading/`
29 |     * Upload file to S3
30 |     * Delete from file from `uploading/`
31 | 
32 | The `Worker` Process i:
33 | -----------------------
34 | * While `incoming` is non-empty:
35 |     * Move file from `incoming/` to `input-i/`
36 |     * For each line in file:
37 |         * Parse line giving us path and histogram
38 |         * if parse error
39 |             * Write to somewhere in `log-i/`
40 |             * Skip line
41 |         * Convert histogram
42 |         * RecordWriter.write(path, historgram.serilize())
43 |     * Delete input file
44 |     * If SIGHUP has been seen:
45 |         * close all files and compressor context in HashTable
46 |         * Compress files and move them to `upload/`
47 | * On SIGHUP: Raise a boolean flag.
48 | 
49 | 
50 | The idea with WorkerProcess:
51 | ----------------------------
52 | * We can stop it at anytime and upload (by sending it a SIGHUP)
53 | * We can keep it running and feed it data until it produces big files (worst
54 |   case, one file per day for a given set of partitions)
55 | * We can tweak number of compression contexts, reducing intermediate disk I/O
56 |   in exchange for increased memory usage
57 | * If we crash, uncompressed files from `work-i/` can be compressed and uploaded
58 | * If we crash, offending `incoming` file is located in `input-i/` this can be
59 |   uploaded for tests (not for reapplication if we do previous thing)
60 | * Both conversion and compression happens in WorkerProcess, so we can't fill up
61 |   a pipe somewhere and have IPC problems
62 | * Problem with conversion and compression in same process if that if conversion
63 |   crashes, partially compressed data is corrupt
64 | 


--------------------------------------------------------------------------------
/docs/data_flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/docs/data_flow.png


--------------------------------------------------------------------------------
/http/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/http/__init__.py


--------------------------------------------------------------------------------
/http/analysis-service/config.py:
--------------------------------------------------------------------------------
 1 | SECRET_KEY         = 'Overwrite with a secret on deployment'
 2 | 
 3 | # AWS EC2 configuration
 4 | AWS_REGION         = 'us-west-2'
 5 | INSTANCE_TYPE      = 'c3.4xlarge'
 6 | WORKER_AMI         = 'ami-0057b733' # -> telemetry-worker-hvm-20151019 (Ubuntu 15.04)
 7 | WORKER_PRIVATE_PROFILE = 'telemetry-example-profile'
 8 | WORKER_PUBLIC_PROFILE  = 'telemetry-example-profile'
 9 | 
10 | # EMR configuration
11 | # Master and slave instance types should be the same as the telemetry
12 | # setup bootstrap action depends on it to autotune the cluster.
13 | MASTER_INSTANCE_TYPE = INSTANCE_TYPE
14 | SLAVE_INSTANCE_TYPE = INSTANCE_TYPE
15 | EMR_RELEASE = 'emr-4.5.0'
16 | SPARK_INSTANCE_PROFILE = 'telemetry-example-profile'
17 | SPARK_EMR_BUCKET = 'example'
18 | 
19 | # Make sure the ephemeral map matches the instance type above.
20 | EPHEMERAL_MAP      = { "/dev/xvdb": "ephemeral0", "/dev/xvdc": "ephemeral1" }
21 | SECURITY_GROUPS    = []
22 | INSTANCE_PROFILE   = 'telemetry-analysis-profile'
23 | INSTANCE_APP_TAG   = 'telemetry-analysis-worker-instance'
24 | EMAIL_SOURCE       = 'telemetry-alerts@mozilla.com'
25 | 
26 | # Tags for accounting purposes
27 | ACCOUNTING_APP_TAG  = 'telemetry-analysis'
28 | ACCOUNTING_TYPE_TAG = 'worker'
29 | 
30 | # Buckets for storing S3 data
31 | TEMPORARY_BUCKET   = 'bucket-for-ssh-keys'
32 | CODE_BUCKET        = 'telemetry-analysis-code-2'
33 | PUBLIC_DATA_BUCKET = 'telemetry-public-analysis-2'
34 | PRIVATE_DATA_BUCKET = 'telemetry-private-analysis-2'
35 | 


--------------------------------------------------------------------------------
/http/analysis-service/makefile:
--------------------------------------------------------------------------------
 1 | CFYAML = ../../telemetry/util/cf-yaml-helper.py
 2 | 
 3 | SOURCES_BUCKET = telemetry-analysis-code-2
 4 | VERSION = 21
 5 | 
 6 | FILES = $(shell find * -name "*.py") \
 7 | 		$(shell find * -name "*.sh") \
 8 | 		$(shell find * -name "*.css") \
 9 | 		$(shell find * -name "*.png") \
10 | 		$(shell find * -name "*.html")
11 | 
12 | telemetry-analysis-service.tar.gz: $(FILES)
13 | 	tar -czf $@ $^
14 | 
15 | analysis-service-stack.json: analysis-service-stack.yaml
16 | 	$(CFYAML) $< > $@
17 | 
18 | analysis-resources.json: analysis-resources.yaml
19 | 	$(CFYAML) $< > $@
20 | 
21 | put: telemetry-analysis-service.tar.gz analysis-service-stack.json
22 | 	aws s3 cp telemetry-analysis-service.tar.gz s3://$(SOURCES_BUCKET)/v$(VERSION)/telemetry-analysis-service.tar.gz
23 | 	aws s3 cp analysis-service-stack.json s3://$(SOURCES_BUCKET)/v$(VERSION)/analysis-service-stack.json
24 | 
25 | clean:
26 | 	rm -f telemetry-analysis-service.tar.gz analysis-service-stack.json
27 | 
28 | .PHONY: put
29 | 


--------------------------------------------------------------------------------
/http/analysis-service/requirements:
--------------------------------------------------------------------------------
 1 | boto==2.38.0
 2 | botocore==1.3.9
 3 | docutils==0.12
 4 | Flask==0.10.1
 5 | Flask-BrowserID==0.0.4
 6 | Flask-Login==0.3.2
 7 | futures==2.2.0
 8 | itsdangerous==0.24
 9 | Jinja2==2.8
10 | jmespath==0.9.0
11 | MarkupSafe==0.23
12 | python-dateutil==2.4.2
13 | requests==2.8.1
14 | six==1.10.0
15 | SQLAlchemy==1.0.9
16 | Werkzeug==0.11.2
17 | 


--------------------------------------------------------------------------------
/http/analysis-service/static/cluster.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/http/analysis-service/static/cluster.png


--------------------------------------------------------------------------------
/http/analysis-service/static/schedule.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/http/analysis-service/static/schedule.png


--------------------------------------------------------------------------------
/http/analysis-service/static/style.css:
--------------------------------------------------------------------------------
 1 | /* This Source Code Form is subject to the terms of the Mozilla Public
 2 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 3 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 4 | 
 5 | img {
 6 |     vertical-align: middle;
 7 | }
 8 | 
 9 | table {
10 |     border-collapse: collapse;
11 | }
12 | 
13 | tr:nth-child(odd) {
14 |     background-color: #eee;
15 | }
16 | 
17 | td {
18 |     padding-right: 10px;
19 | }
20 | 
21 | td.field-desc {
22 |     font-size: 85%;
23 |     color: #555;
24 |     text-align: left;
25 | }
26 | 
27 | td.field-input {
28 |     text-align: left;
29 |     margin: 10px;
30 |     padding: 10px;
31 | }
32 | 
33 | td.field-label {
34 |     min-width: 15%;
35 |     font-weight: bold;
36 |     text-align: right;
37 | }
38 | 
39 | label.error {
40 |     font-size: 85%;
41 |     color: red;
42 |     text-align: left;
43 | }
44 | 
45 | div.field-error {
46 |     font-size: 85%;
47 |     color: red;
48 |     text-align: left;
49 |     border: 1px dotted red;
50 | }
51 | 
52 | body {
53 |   /* Padding to leave room for header and footer */
54 |   padding-top: 50px;
55 |   padding-bottom: 80px;
56 | }
57 | 
58 | .button-margin {
59 |     margin-top: 10px;
60 | }
61 | 
62 | html {
63 |   position: relative;
64 |   min-height: 100%;
65 | }
66 | 
67 | #footer {
68 |   position: absolute;
69 |   bottom: 0;
70 |   width: 100%;
71 |   /* Set the fixed height of the footer here */
72 |   height: 60px;
73 |   background-color: #eee;
74 | }
75 | 
76 | .no-select {
77 |     -webkit-touch-callout: none;
78 |     -webkit-user-select: none;
79 |     -khtml-user-select: none;
80 |     -moz-user-select: none;
81 |     -ms-user-select: none;
82 |     user-select: none;
83 | }
84 | 


--------------------------------------------------------------------------------
/http/analysis-service/static/worker.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/http/analysis-service/static/worker.png


--------------------------------------------------------------------------------
/http/analysis-service/templates/boot-script.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | cd /home/ubuntu
 4 | 
 5 | # Install a few dependencies
 6 | install()
 7 | {
 8 |     apt-get update
 9 |     DEBIAN_FRONTEND=noninteractive apt-get -y \
10 |         -o DPkg::Options::=--force-confdef \
11 |         -o DPkg::Options::=--force-confold \
12 |         install $@
13 | }
14 | install xz-utils python-pip git python-dev ntp python-protobuf python-snappy
15 | pip install --upgrade boto awscli simplejson
16 | 
17 | # Get users ssh key
18 | python - << END
19 | from boto.s3 import connect_to_region
20 | s3 = connect_to_region('{{ aws_region }}')
21 | b = s3.get_bucket('{{ temporary_bucket }}', validate = False)
22 | k = b.get_key('{{ ssh_key }}')
23 | k.get_contents_to_filename('/home/ubuntu/user_key.pub')
24 | END
25 | 
26 | {% if ephemeral_map %}
27 | # RAID0 Configuration:
28 | {% set raid_devices = ephemeral_map.keys()|sort %}
29 | {% set device_list = " ".join(raid_devices) %}
30 | install mdadm xfsprogs
31 | umount /mnt
32 | yes | mdadm --create /dev/md0 --level=0 -c64 --raid-devices={{ raid_devices|length }} {{ device_list }}
33 | echo 'DEVICE {{ device_list }}' >> /etc/mdadm/mdadm.conf
34 | mdadm --detail --scan >> /etc/mdadm/mdadm.conf
35 | mkfs.xfs /dev/md0
36 | mount /dev/md0 /mnt
37 | {% endif %}
38 | 
39 | # Setup users ssh_key
40 | cat /home/ubuntu/user_key.pub >> /home/ubuntu/.ssh/authorized_keys
41 | chmod 600 /home/ubuntu/.ssh/authorized_keys
42 | 
43 | # Set the default AWS region
44 | if [ ! -d /home/ubuntu/.aws ]; then
45 |   sudo -u ubuntu mkdir /home/ubuntu/.aws
46 | fi
47 | if [ ! -f /home/ubuntu/.aws/config ]; then
48 |   sudo -u ubuntu echo -e "[default]\nregion = {{ aws_region }}" > /home/ubuntu/.aws/config
49 | fi
50 | 
51 | # Make telemetry work dir
52 | if [ ! -d /mnt/telemetry ]; then
53 |   mkdir /mnt/telemetry
54 | fi
55 | chown ubuntu:ubuntu /mnt/telemetry
56 | 
57 | # Setup the motd
58 | sudo cat >/etc/motd <<END
59 | Welcome to a Telemetry Analysis worker node.
60 | 
61 | * For a quick intro, see:
62 |   http://mreid-moz.github.io/blog/2013/11/06/current-state-of-telemetry-analysis/
63 | 
64 | * The telemetry-server repository is at ~/telemetry-server
65 | 
66 | * Don't forget to copy any important code and data off this machine! It will
67 |   only be available for 24 hours, after which its contents will be...
68 |           ********************
69 |           *** GONE FOREVER ***
70 |           ********************
71 | 
72 | * If you get stuck, drop by #telemetry on irc.mozilla.org
73 | END
74 | 
75 | # Check out telemetry-server repo:
76 | sudo -u ubuntu git clone https://github.com/mozilla/telemetry-server.git
77 | 


--------------------------------------------------------------------------------
/http/analysis-service/templates/cluster/cluster.html:
--------------------------------------------------------------------------------
 1 | {% import 'macros.html' as macros with context %}
 2 | {% extends "base.html" %}
 3 | 
 4 | {% block cluster_active %} class="active" {% endblock %}
 5 | 
 6 | {% block title %}
 7 |  <h1>Launch an ad-hoc Spark cluster</h1>
 8 |  <p>
 9 |   <img width="48" src="{{ url_for('static', filename='cluster.png') }}" />
10 |   Launch a Spark cluster in the cloud and use it for custom data analysis.</p>
11 |  <p>
12 |   The cluster will be available for 24 hours, then it will be automatically
13 |   terminated.
14 |  </p>
15 |  <p>
16 |    For a guide of how to do data analysis using Spark, check out
17 |    <a href="http://wp.me/p45GUY-fu">
18 |      :rvitillo's blog post on the topic</a>.
19 |  </p>
20 | {% endblock %}
21 | {% block content %}
22 |  <p class="lead">Launch a cluster:</p>
23 |  <form action="{{ url_for('cluster_spawn') }}" method="post" enctype="multipart/form-data">
24 |   <input type="hidden" name="token" value="{{ token }}">
25 |   <table>
26 |    {% set username = current_user.email.split('@')[0] %}
27 | 
28 |    {% call macros.make_input('name', 'Cluster Name', 'text', username + '-telemetry-analysis') %}
29 |     The cluster name identifies this cluster within AWS. Use something short
30 |     like &quot;{{ username }}-charset-usage&quot;
31 |    {% endcall %}
32 | 
33 |    {% call macros.make_input('num_workers', 'Cluster Size', value='1') %}
34 |     The number of workers for this cluster. Please keep in mind to use resources
35 |     sparingly. Use a single worker to write and debug your job.
36 |    {% endcall %}
37 | 
38 |    {% call macros.make_input('public-ssh-key', 'SSH Public Key', type='file') %}
39 |     Your public key file. Usually <code>id_rsa.pub</code>. This will be appended
40 |     to the server's <code>authorized_keys</code> to grant you SSH access. Do
41 |     <strong>not</strong> upload a private key file.
42 |    {% endcall %}
43 |   </table>
44 |   <input type="submit" class="btn btn-success" name="submit" value="Submit">
45 |  </form>
46 | {% endblock %}
47 | 


--------------------------------------------------------------------------------
/http/analysis-service/templates/cluster/email.html:
--------------------------------------------------------------------------------
 1 | Hi,
 2 | <br>
 3 | <br>
 4 | We've launched an EMR cluster with access to telemetry published data, at your
 5 | request. As the cluster powers up you can:<br>
 6 | <ul>
 7 |   <li>monitor the cluster status,</li>
 8 |   <li>find the public DNS, and</li>
 9 |   <li>terminate the cluster,</li>
10 | </ul>
11 | here: <br>
12 | <br>
13 |   <b><a href="{{ monitoring_url }}">{{ monitoring_url }}</a></b>
14 | <br>
15 | <br>
16 | Please, be sure to kill your cluster when you're done with it.
17 | 
18 | 


--------------------------------------------------------------------------------
/http/analysis-service/templates/cluster/kill.html:
--------------------------------------------------------------------------------
 1 | {% extends "base.html" %}
 2 | 
 3 | {% block cluster_active %} class="active" {% endblock %}
 4 | 
 5 | {% block title %}
 6 |  <h1>Your cluster is now dying, and will soon be dead.</h1>
 7 | {% endblock %}
 8 | 
 9 | {% block content %}
10 | <p class="lead">We've requested that your cluster ({{ jobflow_id }}) be killed.</p>
11 | 
12 | <div class="row">
13 |  <div class="col-md-2"><strong>ID:</strong></div>
14 |  <div class="col-md-10">{{ jobflow_id }}</div>
15 | </div>
16 | <div class="row">
17 |  <div class="col-md-2"><strong>State:</strong></div>
18 |  <div class="col-md-10">{{ jobflow_state }}</div>
19 | </div>
20 | <br/>
21 | {% endblock %}
22 | 
23 | 


--------------------------------------------------------------------------------
/http/analysis-service/templates/cluster/schedule_create.html:
--------------------------------------------------------------------------------
 1 | {% extends "cluster/schedule.html" %}
 2 | {% block content %}
 3 | <p class="lead">Success!</p>
 4 | 
 5 | <p>Your code has been uploaded to <code>{{ code_s3path }}</code>.</p>
 6 | <p>Any output files found in relative to where the notebook will be execute will
 7 |   be published at <code>{{ data_s3path }}</code>. The output files will overwrite
 8 |   anything already in that location in S3.</p>
 9 | <p>The job will be run <strong>{{ job_frequency }}</strong> at
10 |   <strong>{{ job_time }}{{ job_dow }}{{ job_dom }}</strong>.</p>
11 | <p>The job will be allowed to run for a max of
12 |   <strong>{{ job_timeout }}</strong> minutes, after which it will be killed.
13 | 
14 | <p>
15 |   Cron spec will be <code>{{ cron_spec }}</code>
16 | <p>
17 | <p>
18 |   <a class="btn btn btn-success" href="{{ url_for('cluster_schedule_job') }}">Go back</a>
19 | </p>
20 | {% endblock %}
21 | 


--------------------------------------------------------------------------------
/http/analysis-service/templates/cluster/schedule_delete.html:
--------------------------------------------------------------------------------
 1 | {% extends "cluster/schedule.html" %}
 2 | {% block title %}
 3 | <h1>Delete a scheduled Spark job</h1>
 4 | {% endblock %}
 5 | {% block content %}
 6 | <p class="lead">
 7 |   Job <code>{{ job.name }}</code> has been deleted (deleted
 8 |   {{ result.rowcount }} record{% if result.rowcount > 1 %}s{% endif %}).
 9 | </p>
10 | <p>
11 |   <a class="btn btn btn-success" href="{{ url_for('cluster_schedule_job') }}">Go back</a>
12 | </p>
13 | {% endblock %}
14 | 


--------------------------------------------------------------------------------
/http/analysis-service/templates/cluster/schedule_files.html:
--------------------------------------------------------------------------------
 1 | {% import 'macros.html' as macros with context %}
 2 | {% extends "base.html" %}
 3 | 
 4 | {% block cluster_schedule_active %} class="active" {% endblock %}
 5 | 
 6 | {% block title %}
 7 | <h1>View files for a scheduled Spark job</h1>
 8 | {% endblock %}
 9 | 
10 | {% block content %}
11 | {% if files %}
12 | <div class="files">
13 |   <p class="lead">
14 |     View {{ name }} output for job {{ job.id }}: <strong>{{ job.name }}</strong> below.
15 |   </p>
16 |   {% for f in files %}
17 |   <div class="row">
18 |     {% if f.url.endswith(".ipynb") %}
19 |     <p><a href="{{ "http://nbviewer.ipython.org/urls/" + f.url[8:] }}" target="_blank">{{ f.title|default(f.url) }}</a></p>
20 |     {% else %}
21 |     <p><a href="{{ f.url }}">{{ f.title|default(f.url) }}</a></p>
22 |     {% endif %}
23 |   </div>
24 |   {% endfor %}
25 | </div>
26 | <hr/>
27 | {% else %}
28 | <p class="lead">
29 |   There is no {{ name }} output for job <strong>{{ job.name }}</strong> yet.
30 |   Either it hasn't successfully run, or it has always timed out.
31 |   Current time limit is {{ job.timeout_minutes }} minutes.
32 | </p>
33 | {% endif %}
34 | <p>
35 |   <a class="btn btn btn-success" href="{{ url_for('cluster_schedule_job') }}">Go back</a>
36 | </p>
37 | {% endblock %}
38 | 


--------------------------------------------------------------------------------
/http/analysis-service/templates/index.html:
--------------------------------------------------------------------------------
 1 | {% extends "base.html" %}
 2 | {% block home_active %} class="active" {% endblock %}
 3 | {% block content %}
 4 | <div class="row">
 5 |  <div class="col-md-6">
 6 |   <h2>Launch a Spark Cluster</h2>
 7 |   <p>Launch a Spark cluster in the cloud and use it for custom data analysis.
 8 |   The cluster will be available for 24 hours, then it will be automatically
 9 |   terminated.</p>
10 |   <p class="lead">
11 |    <a href="{{ url_for('cluster_get_params') }}">
12 |     <img width="48" src="{{ url_for('static', filename='cluster.png') }}" />
13 |     Launch an ad-hoc Spark cluster
14 |    </a>
15 |   </p>
16 |  </div>
17 |  <div class="col-md-6">
18 |    <h2>Schedule a Spark Job</h2>
19 |    <p>Run a Spark analysis on a scheduled basis. The output of the analysis
20 |    will be published in Amazon S3.</p>
21 |    <p class="lead">
22 |     <a href="{{ url_for('cluster_schedule_job') }}">
23 |      <img width="48" src="{{ url_for('static', filename='schedule.png') }}" />
24 |      Schedule a Spark analysis job
25 |     </a>
26 |    </p>
27 |  </div>
28 | </div>
29 | <div class="row">
30 |  <div class="col-md-6">
31 |   <h2>Launch a Worker</h2>
32 |   <p>Launch a server in the cloud and use it for custom map-reduce data analysis.
33 |   The machine will be available for 24 hours, then it will be automatically
34 |   terminated.</p>
35 |   <p class="lead">
36 |    <a href="{{ url_for('get_worker_params') }}">
37 |     <img width="48" src="{{ url_for('static', filename='worker.png') }}" />
38 |     Launch an ad-hoc analysis worker
39 |    </a>
40 |   </p>
41 |  </div>
42 |  <div class="col-md-6">
43 |    <h2>Schedule a Job</h2>
44 |    <p>Run a map-reduce telemetry analysis on a scheduled basis. The output of the analysis
45 |    will be published in Amazon S3.</p>
46 |    <p class="lead">
47 |     <a href="{{ url_for('schedule_job') }}">
48 |      <img width="48" src="{{ url_for('static', filename='schedule.png') }}" />
49 |      Schedule an analysis job
50 |     </a>
51 |    </p>
52 |  </div>
53 | </div>
54 | {% endblock %}
55 | 


--------------------------------------------------------------------------------
/http/analysis-service/templates/instance-launched-email.html:
--------------------------------------------------------------------------------
 1 | Hi,<br>
 2 | We've launched an EC2 instance with access to telemetry published data, at your
 3 | request. As the instance powers up you can:<br>
 4 | <ul>
 5 |   <li>Monitor instance status,</li>
 6 |   <li>Find public DNS, and</li>
 7 |   <li>Terminate the instance,</li>
 8 | </ul>
 9 | here: {{ monitoring_url }}<br>
10 | <br>
11 | Please, be sure to kill your instance when you're done with it.


--------------------------------------------------------------------------------
/http/analysis-service/templates/kill.html:
--------------------------------------------------------------------------------
 1 | {% extends "base.html" %}
 2 | 
 3 | {% block worker_active %} class="active" {% endblock %}
 4 | 
 5 | {% block title %}
 6 |  <h1>Your instance is now dying, and will soon be dead.</h1>
 7 | {% endblock %}
 8 | 
 9 | {% block content %}
10 | <p class="lead">We've requested that your instance ({{ instance_id }}) be killed.</p>
11 | 
12 | <div class="row">
13 |  <div class="col-md-2"><strong>ID:</strong></div>
14 |  <div class="col-md-10">{{ instance_id }}</div>
15 | </div>
16 | <div class="row">
17 |  <div class="col-md-2"><strong>State:</strong></div>
18 |  <div class="col-md-10">{{ instance_state }}</div>
19 | </div>
20 | <br/>
21 | <p class="lead">
22 |  You can still <a href="{{ monitoring_url }}">monitor it</a> just to
23 |  make sure it terminates.
24 | </p>
25 | {% endblock %}
26 | 
27 | 


--------------------------------------------------------------------------------
/http/analysis-service/templates/macros.html:
--------------------------------------------------------------------------------
 1 | {% macro make_input(name, label, type='text', value=None, required=True) -%}
 2 |  <tr>
 3 |   <td class="field-label"> <label> {{ label }}: </label> </td>
 4 |   <td class="field-input">
 5 |    {% if errors and name in errors %}
 6 |     <div class="field-error">
 7 |      {{ errors[name] }}
 8 |    {% endif %}
 9 |    <input type="{{ type }}" name="{{ name }}" {% if required %} required {% endif %}
10 |    {% if values and values[name] %}
11 |     value="{{ values[name] }}"
12 |    {% elif value %}
13 |     value="{{ value }}"
14 |    {% endif %}
15 |    {% if type == 'text' %}
16 |     size="35"
17 |    {% endif %} />
18 |    {% if errors and name in errors %}
19 |     </div>
20 |    {% endif %}
21 |   </td>
22 |   <td class="field-desc">
23 |    {{ caller() }}
24 |   </td>
25 |  </tr>
26 | {%- endmacro %}
27 | 
28 | {% macro begin_select(errors, name, label) -%}
29 |  <tr>
30 |     <td class="field-label"> <label> {{ label }}: </label> </td>
31 |     <td class="field-input">
32 |      {% if errors and name in errors %}
33 |       <div class="field-error">
34 |        {{ errors[name] }}
35 |      {% endif %}
36 |      <select name="{{ name }}" required>
37 | {%- endmacro %}
38 | 
39 | {% macro select_option(value, label, selected=False, name=None) -%}
40 |       <option value="{{ value }}"
41 |        {% if values[name] -%}
42 |         {% if values[name] == value %} selected {% endif -%}
43 |        {% elif selected %}
44 |         selected
45 |        {%- endif -%}>{{ label }}</option>
46 | {%- endmacro %}
47 | 
48 | {% macro end_select_options() -%}
49 |      </select>
50 |     </td>
51 |     <td class="field-desc">
52 | {%- endmacro %}
53 | 
54 | {% macro end_select() -%}
55 |   </td>
56 |  </tr>
57 | {%- endmacro %}
58 | 


--------------------------------------------------------------------------------
/http/analysis-service/templates/schedule_create.html:
--------------------------------------------------------------------------------
 1 | {% extends "schedule.html" %}
 2 | {% block content %}
 3 | <p class="lead">Success!</p>
 4 | 
 5 | <p>Your code has been uploaded to <code>{{ code_s3path }}</code>.</p>
 6 | <p>This tarball will be unpacked on a worker node, then <code>{{ commandline }}</code>
 7 | will be invoked.</p>
 8 | <p>Any output files found in <code>{{ output_dir }}</code> (relative to where
 9 | the tarball was unpacked) will be published at <code>{{ data_s3path }}</code>.
10 | The output files will overwrite anything already in that location in S3.</p>
11 | <p>The job will be run <strong>{{ job_frequency }}</strong> at
12 | <strong>{{ job_time }}{{ job_dow }}{{ job_dom }}</strong>.</p>
13 | <p>The job will be allowed to run for a max of
14 | <strong>{{ job_timeout }}</strong> minutes, after which it will be killed.
15 | 
16 | <p>
17 |  Cron spec will be <code>{{ cron_spec }}</code>
18 | <p>
19 | <p>
20 |  <a class="btn btn btn-success" href="{{ url_for('schedule_job') }}">Go back</a>
21 | </p>
22 | {% endblock %}
23 | 


--------------------------------------------------------------------------------
/http/analysis-service/templates/schedule_delete.html:
--------------------------------------------------------------------------------
 1 | {% extends "schedule.html" %}
 2 | {% block title %}
 3 |  <h1>Delete a scheduled analysis job</h1>
 4 | {% endblock %}
 5 | {% block content %}
 6 |  <p class="lead">
 7 |   Job <code>{{ job.name }}</code> has been deleted (deleted
 8 |   {{ result.rowcount }} record{% if result.rowcount > 1 %}s{% endif %}).
 9 |  </p>
10 |  <p>
11 |   <a class="btn btn btn-success" href="{{ url_for('schedule_job') }}">Go back</a>
12 |  </p>
13 | {% endblock %}
14 | 


--------------------------------------------------------------------------------
/http/analysis-service/templates/schedule_files.html:
--------------------------------------------------------------------------------
 1 | {% import 'macros.html' as macros with context %}
 2 | {% extends "base.html" %}
 3 | 
 4 | {% block schedule_active %} class="active" {% endblock %}
 5 | 
 6 | {% block title %}
 7 |  <h1>View files for a scheduled job</h1>
 8 | {% endblock %}
 9 | 
10 | {% block content %}
11 |  {% if files %}
12 |   <div class="files">
13 |    <p class="lead">
14 |     View {{ name }} output for job {{ job.id }}: <strong>{{ job.name }}</strong> below.
15 |    </p>
16 |    {% for f in files %}
17 |     <div class="row">
18 |      <p><a href="{{ f.url }}">{{ f.title|default(f.url) }}</a></p>
19 |     </div>
20 |    {% endfor %}
21 |   </div>
22 |   <hr/>
23 |  {% else %}
24 |   <p class="lead">
25 |    There is no {{ name }} output for job <strong>{{ job.name }}</strong> yet.
26 |    Either it hasn't successfully run, or it has always timed out.
27 |    Current time limit is {{ job.timeout_minutes }} minutes.
28 |   </p>
29 |  {% endif %}
30 | {% endblock %}
31 | 


--------------------------------------------------------------------------------
/http/analysis-service/templates/worker.html:
--------------------------------------------------------------------------------
 1 | {% import 'macros.html' as macros with context %}
 2 | {% extends "base.html" %}
 3 | 
 4 | {% block worker_active %} class="active" {% endblock %}
 5 | 
 6 | {% block title %}
 7 |  <h1>Launch an ad-hoc analysis worker</h1>
 8 |  <p>
 9 |   <img width="48" src="{{ url_for('static', filename='worker.png') }}" />
10 |   Launch a server in the cloud and use it for custom data analysis.</p>
11 |  <p>
12 |   The machine will be available for 24 hours, then it will be automatically
13 |   terminated.
14 |  </p>
15 |  <p>
16 |   For more info on how to use run a custom analysis, check out
17 |   <a href="//mreid-moz.github.io/blog/2013/11/06/current-state-of-telemetry-analysis/">
18 |   :mreid's blog post on the topic</a>.
19 |  </p>
20 |  <p>
21 |   Or you can read the docs for the
22 |   <a href="//github.com/mozilla/telemetry-server/blob/master/docs/MapReduce.md">
23 |   Telemetry MapReduce framework</a>.
24 |  </p>
25 | {% endblock %}
26 | {% block content %}
27 |  <p class="lead">Launch a worker:</p>
28 |  <form action="{{ url_for('spawn_worker_instance') }}" method="post" enctype="multipart/form-data">
29 |   <input type="hidden" name="token" value="{{ token }}">
30 |   <table>
31 |    {% set username = current_user.email.split('@')[0] %}
32 |    {% call macros.make_input('name', 'Server Name', 'text', username + '-telemetry-analysis') %}
33 |     The server name identifies this machine within AWS. Use something short
34 |     like &quot;{{ username }}-charset-usage&quot;
35 |    {% endcall %}
36 | 
37 |    {% call macros.make_input('public-ssh-key', 'SSH Public Key', type='file') %}
38 |     Your public key file. Usually <code>id_rsa.pub</code>. This will be appended
39 |     to the server's <code>authorized_keys</code> to grant you SSH access. Do
40 |     <strong>not</strong> upload a private key file.
41 |    {% endcall %}
42 |   </table>
43 |   <input type="submit" class="btn btn-success" name="submit" value="Submit">
44 |  </form>
45 | {% endblock %}
46 | 


--------------------------------------------------------------------------------
/http/analysis-service/terminate-expired-instances.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | def main():
4 |     print "ATMO v1 is no longer in charge of terminating expired clusters."
5 | 
6 | if __name__ == '__main__':
7 |     main()
8 | 


--------------------------------------------------------------------------------
/http/analysis-service/user.py:
--------------------------------------------------------------------------------
 1 | from flask.ext.login import UserMixin, AnonymousUserMixin
 2 | 
 3 | class User(UserMixin):
 4 |     def __init__(self, email):
 5 |         self.email = email
 6 | 
 7 |     def is_authenticated(self):
 8 |         return self.email != None
 9 | 
10 |     def is_authorized(self):
11 |         return self.email.endswith('mozilla.com') or self.email.endswith('mozilla.org')
12 | 
13 |     def is_active(self):
14 |         return self.email != None
15 | 
16 |     def is_anonymous(self):
17 |         return self.email == None
18 | 
19 |     def get_id(self):
20 |         return self.email
21 | 
22 | class AnonymousUser(AnonymousUserMixin):
23 |     def is_authorized(self):
24 |         return False


--------------------------------------------------------------------------------
/http/server_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "motd": " ==== Telemetry Server. Accepting Submissions since 2013. ====",
 3 |   "max_data_length": 204800,
 4 |   "max_path_length": 10240,
 5 |   "port": 8080,
 6 |   "log_path": "/mnt/telemetry/data",
 7 |   "max_log_age_ms": 300000,
 8 |   "max_log_size": 524288000,
 9 |   "include_request_ip": true,
10 |   "stats_log_file": "/var/log/telemetry/telemetry-server.log"
11 | }
12 | 


--------------------------------------------------------------------------------
/mapreduce/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/mapreduce/__init__.py


--------------------------------------------------------------------------------
/mapreduce/addon_perf/README.md:
--------------------------------------------------------------------------------
 1 | Telemetry map/reduce to analyze bootstrap add-on performance probes
 2 | ===================================================================
 3 | 
 4 | Files:
 5 | 
 6 | - addon_perf.py: Telemetry map/reduce to crunch raw data into lines describing:
 7 |     - exceptions caught during add-on manager initialization
 8 |     - histogram of time taken by add-on file scans and bootstrap methods
 9 | 
10 | - combine.py: merge outputs from am_exceptions.py and generate .csv format summaries
11 |     - weekly-addons-{date}.csv.gz
12 |     - weekly-exceptions-{date}.csv.ga
13 | 
14 | - run.sh: driver script for Telemetry scheduled daily job - downloads actual M/R code
15 |   from Github and executes job
16 | 
17 | - processAddonPerf.py: analysis script that runs the telemetry M/R job using addon_perf.py
18 |   and then produces the output files by gathering the week's data from S3 and running combine.py
19 | 
20 | - filter_template.json: template for M/R job filter; processExceptions.py creates a copy
21 |   with the desired date for each M/R run
22 | 


--------------------------------------------------------------------------------
/mapreduce/addon_perf/addon-perf.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "sort-options": {
 3 |     "values": ["Impact", "Popularity", "Median (ms)", "75% (ms)"],
 4 |     "selected": "Impact"
 5 |   },
 6 |   "filter-options": [
 7 |     {"id": "Application",
 8 |      "values": ["", "Firefox", "Fennec"],
 9 |      "selected": ""},
10 |     {"id": "Platform",
11 |      "values": ["", "WINNT", "Linux", "Darwin", "Android"],
12 |      "selected": ""},
13 |     {"id": "Measure",
14 |      "values": ["", "startup_MS", "shutdown_MS"],
15 |      "selected": ""},
16 |     {"id": "Limit",
17 |      "values": [10, 20, 50],
18 |      "selected": 10}
19 |   ],
20 |   "title": ["Telemetry Add-on Performance", "Bootstrap add-on start up and shut down times"],
21 |   "primary-key": ["Application", "Platform", "Addon ID", "Version", "Measure"],
22 |   "header": ["Application", "Platform", "Addon ID", "Version", "Name", "Measure",
23 |              "Popularity", "Impact", "Median (ms)", "75% (ms)", "95% (ms)"],
24 |   "url-prefix": "https://s3-us-west-2.amazonaws.com/telemetry-public-analysis/addon_perf/data/weekly_addons"
25 | }
26 | 


--------------------------------------------------------------------------------
/mapreduce/addon_perf/addon-scan.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "sort-options": {
 3 |     "values": ["Impact", "Popularity", "Median Count", "Median Time (ms)", "75% (ms)"],
 4 |     "selected": "Impact"
 5 |   },
 6 |   "filter-options": [
 7 |     {"id": "Application",
 8 |      "values": ["", "Firefox", "Fennec"],
 9 |      "selected": ""},
10 |     {"id": "Platform",
11 |      "values": ["", "WINNT", "Linux", "Darwin", "Android"],
12 |      "selected": ""},
13 |     {"id": "Limit",
14 |      "values": [10, 20, 50],
15 |      "selected": 10}
16 |   ],
17 |   "title": ["Unpacked Add-on File Scan", "Start-up scan times for unpacked add-ons"],
18 |   "primary-key": ["Application", "Platform", "Addon ID", "Version"],
19 |   "header": ["Application", "Platform", "Addon ID", "Version", "Name", "Popularity",
20 |              "Impact", "Median Count", "Median Time (ms)", "75% (ms)", "95% (ms)"],
21 |   "url-prefix": "https://s3-us-west-2.amazonaws.com/telemetry-public-analysis/addon_perf/data/weekly_unpacked"
22 | }
23 | 


--------------------------------------------------------------------------------
/mapreduce/addon_perf/addon-versions.py:
--------------------------------------------------------------------------------
 1 | # Process daily add-on telemetry extract to see variation in add-on version #
 2 | # usage: addon-versions.py input-filename [input-filename ...]
 3 | 
 4 | import unicodecsv as ucsv
 5 | import simplejson as json
 6 | import gzip
 7 | import sys
 8 | import re
 9 | from collections import defaultdict, Counter
10 | 
11 | APP_COLUMN=1
12 | OS_COLUMN=2
13 | VER_COLUMN=3
14 | CHAN_COLUMN=4
15 | TEXT_COLUMN=5
16 | 
17 | # Keep track of how many version #s we see for an add-on ID
18 | addonVersions = defaultdict(Counter)
19 | # Keep track of how many different names we see for a given add-on ID
20 | addonNames = defaultdict(Counter)
21 | 
22 | # Total number of pings received
23 | sessions = 0
24 | 
25 | for a in sys.argv[1:]:
26 |     print "processing", a
27 |     with gzip.open(a, 'rb') as f:
28 | 
29 |         for line in f:
30 |             try:
31 |                 keyblob, datablob = line.split("\t", 1)
32 |                 key = json.loads(keyblob)
33 | 
34 |                 if key[0] == "E":
35 |                     if key[5] == 'Sessions':
36 |                         sessions += int(datablob)
37 |                     continue
38 |                 (addonID, sep, version) = key[5].rpartition(':')
39 |                 data = json.loads(datablob)
40 |                 addonVersions[addonID][version] += sum(data['name'].values())
41 |                 addonNames[addonID].update(data['name'])
42 |             except Exception as e:
43 |                 print "Bad line: " + str(e) + ": " + line
44 |                 continue
45 | 
46 | # Get the most popular name for the add-on, collapsing ugly broken unicode
47 | rx = re.compile(u'\ufffd+')
48 | def getName(addonID):
49 |     names = addonNames.get(addonID, {})
50 |     if "?" in names:
51 |         del names["?"]
52 |     if len(names) < 1:
53 |         return "?"
54 |     return max(names, key=names.get)
55 | 
56 | print sessions, "sessions,", len(addonVersions), "different add-on IDs"
57 | 
58 | # Things worth knowing?
59 | # Total different add-on IDs
60 | # total # sessions
61 | # count of IDs that have more than one version
62 | # for each ID: most popular name, total count, # versions, count of most popular version
63 | 
64 | writer = ucsv.writer(sys.stdout)
65 | writer.writerow(['Add-on ID', 'name', 'total', 'versions', 'mainVersion', 'count'])
66 | 
67 | # add-ons with more than one version
68 | multiversion = 0
69 | 
70 | for addonID, counts in addonVersions.iteritems():
71 |     name = getName(addonID)
72 |     total = sum(counts.values())
73 |     versions = len(counts)
74 |     if '?' in counts:
75 |         # Don't count the 'disabled' version
76 |         versions = versions - 1
77 |     if versions < 2:
78 |         continue
79 | 
80 |     multiversion += 1
81 |     version, count = max(counts.iteritems(), key=lambda k:k[1])
82 | 
83 |     writer.writerow([addonID, name, total, versions, version, count])
84 | 
85 | print
86 | print multiversion, "add-ons with more than one version"
87 | 


--------------------------------------------------------------------------------
/mapreduce/addon_perf/filter_template.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 1,
 3 |   "dimensions": [
 4 |     {
 5 |       "field_name": "reason",
 6 |       "allowed_values": ["saved-session"]
 7 |     },
 8 |     {
 9 |       "field_name": "appName",
10 |       "allowed_values": "*"
11 |     },
12 |     {
13 |       "field_name": "appUpdateChannel",
14 |       "allowed_values": "*"
15 |     },
16 |     {
17 |       "field_name": "appVersion",
18 |       "allowed_values": "*"
19 |     },
20 |     {
21 |       "field_name": "appBuildID",
22 |       "allowed_values": "*"
23 |     },
24 |     {
25 |       "field_name": "submission_date",
26 |       "allowed_values": ["__TARGET_DATE__"]
27 |     }
28 |   ]
29 | }
30 | 


--------------------------------------------------------------------------------
/mapreduce/addon_perf/package.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | VERSION=0.2
 3 | NAME=addon_perf
 4 | TARBALL=${NAME}-$VERSION.tar.gz
 5 | 
 6 | if [ -f "$TARBALL" ]; then
 7 |     rm -v "$TARBALL"
 8 | fi
 9 | tar czvf "$TARBALL" \
10 |         run.sh \
11 |         README.md
12 | 


--------------------------------------------------------------------------------
/mapreduce/addon_perf/processAddonPerf.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | BASE=$(pwd)
 4 | echo "Working in directory $BASE"
 5 | 
 6 | WORK="$BASE/work"
 7 | OUTPUT="$BASE/output"
 8 | TODAY=$(date +%Y%m%d)
 9 | mkdir -p "$OUTPUT"
10 | mkdir -p "$WORK/cache"
11 | 
12 | # If we have an argument, process that day.
13 | TARGET=$1
14 | if [ -z "$TARGET" ]; then
15 |   # Default to processing "yesterday"
16 |   TARGET=$(date -d 'yesterday' +%Y%m%d)
17 | fi
18 | 
19 | cd telemetry-server
20 | JOB="mapreduce/addon_perf"
21 | 
22 | FILTER="$WORK/filter.json"
23 | echo "Today is $TODAY, and we're gathering addon_perf data for $TARGET"
24 | sed -r "s/__TARGET_DATE__/$TARGET/" $JOB/filter_template.json > $FILTER
25 | 
26 | DATA_FILE="$OUTPUT/addon_perf${TARGET}.csv"
27 | 
28 | echo "Starting the addon_perf export for $TARGET"
29 | python -u -m mapreduce.job $JOB/addon_perf.py \
30 |   --num-mappers 8 \
31 |   --num-reducers 8 \
32 |   --input-filter $FILTER \
33 |   --data-dir "$WORK/cache" \
34 |   --work-dir $WORK \
35 |   --output $DATA_FILE \
36 |   --bucket telemetry-published-v2
37 | 
38 | echo "Mapreduce job exited with code: $?"
39 | 
40 | echo "compressing"
41 | gzip $DATA_FILE
42 | echo "Done!"
43 | 
44 | echo "Processing weekly data"
45 | cd $BASE
46 | mkdir -p "weekly"
47 | cd weekly
48 | 
49 | # Monday is day 1
50 | OFFSET=$(( $(date -d $TARGET +%u) - 1 ))
51 | MONDAY=$(date -d "$TARGET - $OFFSET days" +%Y%m%d)
52 | SUNDAY=$(date -d "$MONDAY + 6 days" +%Y%m%d)
53 | echo "For target '$TARGET', week is $MONDAY to $SUNDAY"
54 | for f in $(seq 0 6); do
55 |     DAY=$(date -d "$MONDAY + $f days" +%Y%m%d)
56 |     if [ "$DAY" -eq "$TARGET" ]; then
57 |         echo "Using local file for today ($DAY)"
58 |         cp ${DATA_FILE}.gz .
59 |     else
60 |         echo "Fetching $DAY"
61 | 	aws s3 cp s3://telemetry-public-analysis/addon_perf/data/addon_perf$DAY.csv.gz ./addon_perf$DAY.csv.gz
62 |     fi
63 | done
64 | echo "Creating weekly data for $MONDAY to $SUNDAY"
65 | python $BASE/telemetry-server/$JOB/combine.py "$OUTPUT" "$MONDAY" *
66 | echo "Created weekly output file:"
67 | ls -l $OUTPUT/
68 | 
69 | echo "Copying iacomus configs to s3"
70 | cp $BASE/telemetry-server/$JOB/addon-perf.json $BASE/telemetry-server/$JOB/addon-scan.json $OUTPUT
71 | 


--------------------------------------------------------------------------------
/mapreduce/addon_perf/run-fork.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Install additional python modules used by addon_perf analysis
 4 | sudo pip install unicodecsv
 5 | 
 6 | # Replace the default telemetry-server install with our own
 7 | rm -rf telemetry-server
 8 | git clone https://github.com/irvingreid/telemetry-server.git
 9 | (cd telemetry-server; git checkout addon-nightly)
10 | 
11 | # Now run the actually processing job, using the code from Irving's github
12 | time telemetry-server/mapreduce/addon_perf/processAddonPerf.sh
13 | 


--------------------------------------------------------------------------------
/mapreduce/addon_perf/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Install additional python modules used by addon_perf analysis
4 | sudo pip install unicodecsv
5 | 
6 | # Now run the actually processing job
7 | telemetry-server/mapreduce/addon_perf/processAddonPerf.sh
8 | 


--------------------------------------------------------------------------------
/mapreduce/chromehangs/chromehangs.py:
--------------------------------------------------------------------------------
 1 | # This Source Code Form is subject to the terms of the Mozilla Public
 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 4 | 
 5 | # ChromeHangs export, ported from:
 6 | #   https://github.com/mozilla-metrics/telemetry-toolbox
 7 | 
 8 | try:
 9 |     import simplejson as json
10 | except ImportError:
11 |     import json
12 | 
13 | def check_obj(key, o):
14 |     return len(o.get(key, {}).get("memoryMap", [])) > 0
15 | 
16 | def map(k, v, cx):
17 |     try:
18 |         o = v["payload"]
19 |         if check_obj("chromeHangs", o) or check_obj("lateWrites", o):
20 |             # see https://github.com/mozilla/python_moztelemetry/issues/8
21 |             cx.write(k, json.dumps({"chromeHangs": dict(o.get("chromeHangs", {}).items()),
22 |                                     "lateWrites": dict(o.get("lateWrites", {}).items()),
23 |                                     "meta": v.get("meta", {})}))
24 |     except Exception as e:
25 |         print str(e)
26 | 


--------------------------------------------------------------------------------
/mapreduce/chromehangs/combine_week.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # If we have a target argument, process that day.
 4 | TARGET=$1
 5 | if [ -z "$TARGET" ]; then
 6 |   # Default to processing "yesterday"
 7 |   TARGET=$(date -d 'yesterday' +%Y%m%d)
 8 | fi
 9 | NAME=$2
10 | if [ -z "$NAME" ]; then
11 |   NAME=ChromeHangsWeekly
12 | fi
13 | 
14 | OUTPUT=$3
15 | if [ -z "$OUTPUT" ]; then
16 |   OUTPUT=output
17 | fi
18 | 
19 | BASE=$(pwd)
20 | DATA_FILE=$BASE/$OUTPUT/chromehangs-common-$TARGET.csv.gz
21 | 
22 | if [ ! -d "weekly" ]; then
23 |     mkdir -p "weekly"
24 | fi
25 | cd weekly
26 | # Monday is day 1
27 | OFFSET=$(( $(date -d $TARGET +%u) - 1 ))
28 | MONDAY=$(date -d "$TARGET - $OFFSET days" +%Y%m%d)
29 | SUNDAY=$(date -d "$MONDAY + 6 days" +%Y%m%d)
30 | echo "For target '$TARGET', week is $MONDAY to $SUNDAY"
31 | for f in $(seq 0 6); do
32 |     DAY=$(date -d "$MONDAY + $f days" +%Y%m%d)
33 |     if [ "$DAY" -eq "$TARGET" -a -f "$DATA_FILE" ]; then
34 |         echo "Using target local file for today ($DAY)"
35 |         cp ${DATA_FILE} ./
36 |     elif [ -f "$BASE/chromehangs-common-$DAY.csv.gz" ]; then
37 |         echo "Already have local file for $DAY"
38 |         cp "$BASE/chromehangs-common-$DAY.csv.gz" ./
39 |     else
40 |         echo "Fetching $DAY"
41 |         aws s3 cp s3://telemetry-public-analysis-2/$NAME/data/chromehangs-common-$DAY.csv.gz ./
42 |     fi
43 | done
44 | echo "Creating weekly data for $MONDAY to $SUNDAY"
45 | python $BASE/combine.py $BASE/$OUTPUT $MONDAY $SUNDAY
46 | echo "Done!"
47 | 


--------------------------------------------------------------------------------
/mapreduce/chromehangs/filter_template.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 1,
 3 |   "dimensions": [
 4 |     {
 5 |       "field_name": "prefix",
 6 |       "allowed_values": "telemetry-2"
 7 |     },
 8 |     {
 9 |       "field_name": "submissionDate",
10 |       "allowed_values": ["__TARGET_DATE__"]
11 |     },
12 |     {
13 |       "field_name": "sourceName",
14 |       "allowed_values": "telemetry"
15 |     },
16 |     {
17 |       "field_name": "sourceVersion",
18 |       "allowed_values": "4"
19 |     },
20 |     {
21 |       "field_name": "docType",
22 |       "allowed_values": "saved_session"
23 |     },
24 |     {
25 |       "field_name": "appName",
26 |       "allowed_values": "Firefox"
27 |     },
28 |     {
29 |       "field_name": "appUpdateChannel",
30 |       "allowed_values": "nightly"
31 |     },
32 |     {
33 |       "field_name": "appVersion",
34 |       "allowed_values": "*"
35 |     },
36 |     {
37 |       "field_name": "appBuildId",
38 |       "allowed_values": "*"
39 |     }
40 |   ]
41 | }
42 | 


--------------------------------------------------------------------------------
/mapreduce/chromehangs/package.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | VERSION=0.4
 3 | NAME=chromehangs
 4 | TARBALL=${NAME}-$VERSION.tar.gz
 5 | 
 6 | if [ -f "$TARBALL" ]; then
 7 |     rm -v "$TARBALL"
 8 | fi
 9 | tar czvf "$TARBALL" \
10 |         filter_template.json \
11 |         run.sh \
12 |         run_public.sh \
13 |         symbolicate.py \
14 |         extract_common_stacks.py \
15 |         combine.py \
16 |         combine_week.sh \
17 |         chromehangs.py
18 | 
19 | echo "Packaged $NAME code as $TARBALL"
20 | 


--------------------------------------------------------------------------------
/mapreduce/chromehangs/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | OUTPUT=output
 4 | NAME=ChromeHangs
 5 | TODAY=$(date +%Y%m%d)
 6 | if [ ! -d "$OUTPUT" ]; then
 7 |     mkdir -p "$OUTPUT"
 8 | fi
 9 | 
10 | if [ ! -d "temp" ]; then
11 |     mkdir -p "temp"
12 | fi
13 | if [ ! -d "work" ]; then
14 |     mkdir -p "work"
15 | fi
16 | if [ ! -d "data" ]; then
17 |     mkdir -p "data"
18 | fi
19 | 
20 | # If we have an argument, process that day.
21 | TARGET=$1
22 | if [ -z "$TARGET" ]; then
23 |   # Default to processing "yesterday"
24 |   TARGET=$(date -d 'yesterday' +%Y%m%d)
25 | fi
26 | 
27 | echo "Today is $TODAY, and we're gathering $NAME data for $TARGET"
28 | sed -r "s/__TARGET_DATE__/$TARGET/" filter_template.json > filter.json
29 | 
30 | BASE=$(pwd)
31 | RAW_DATA_FILE=$BASE/$OUTPUT/chromehangs-raw-$TARGET.txt
32 | FINAL_DATA_FILE=$BASE/$OUTPUT/chromehangs-symbolicated-$TARGET.txt.gz
33 | COMBINED_DATA_FILE=$BASE/$OUTPUT/chromehangs-common-$TARGET.txt
34 | 
35 | cd ~/telemetry-server
36 | echo "Starting the $NAME export for $TARGET"
37 | python -u -m mapreduce.hekajob $BASE/chromehangs.py \
38 |   --delete-data \
39 |   --num-mappers 16 \
40 |   --input-filter $BASE/filter.json \
41 |   --data-dir $BASE/data \
42 |   --work-dir $BASE/work \
43 |   --output $RAW_DATA_FILE \
44 |   --bucket "net-mozaws-prod-us-west-2-pipeline-data"
45 | 
46 | echo "Mapreduce job exited with code: $?"
47 | 
48 | cd -
49 | echo "Looking for 'error' lines:"
50 | grep -e "^Error," $RAW_DATA_FILE
51 | echo "End of error lines."
52 | 
53 | echo "Symbolicating outputs..."
54 | time python symbolicate.py -i $RAW_DATA_FILE -o $FINAL_DATA_FILE -d $TARGET &> symbolicate.out
55 | SYMBOLICATE_CODE=$?
56 | 
57 | if [ $SYMBOLICATE_CODE -eq 0 ]; then
58 |     echo "Symbolication succeeded (exited with code $SYMBOLICATE_CODE)"
59 | else
60 |     echo "Symbolication failed (exited with code $SYMBOLICATE_CODE). Log:"
61 |     cat symbolicate.out
62 | fi
63 | 
64 | echo "Extracting common stacks..."
65 | time python extract_common_stacks.py -i $FINAL_DATA_FILE -o $COMBINED_DATA_FILE
66 | 
67 | echo "Compressing raw output..."
68 | gzip $RAW_DATA_FILE
69 | 
70 | echo "Compressing combined stacks..."
71 | gzip $COMBINED_DATA_FILE
72 | 
73 | echo "Done!"
74 | 


--------------------------------------------------------------------------------
/mapreduce/chromehangs/run_public.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | OUTPUT=output
 4 | NAME=ChromeHangsWeekly
 5 | TODAY=$(date +%Y%m%d)
 6 | if [ ! -d "$OUTPUT" ]; then
 7 |     mkdir -p "$OUTPUT"
 8 | fi
 9 | 
10 | if [ ! -d "temp" ]; then
11 |     mkdir -p "temp"
12 | fi
13 | if [ ! -d "work" ]; then
14 |     mkdir -p "work"
15 | fi
16 | if [ ! -d "data" ]; then
17 |     mkdir -p "data"
18 | fi
19 | 
20 | # If we have an argument, process that day.
21 | TARGET=$1
22 | if [ -z "$TARGET" ]; then
23 |   # Default to processing "yesterday"
24 |   TARGET=$(date -d 'yesterday' +%Y%m%d)
25 | fi
26 | 
27 | echo "Today is $TODAY, and we're gathering $NAME data for $TARGET"
28 | sed -r "s/__TARGET_DATE__/$TARGET/" filter_template.json > filter.json
29 | 
30 | BASE=$(pwd)
31 | RAW_DATA_FILE=$BASE/chromehangs-raw-$TARGET.txt
32 | FINAL_DATA_FILE=$BASE/chromehangs-symbolicated-$TARGET.txt.gz
33 | COMBINED_DATA_FILE=$BASE/$OUTPUT/chromehangs-common-$TARGET.csv
34 | 
35 | cd ~/telemetry-server
36 | echo "Starting the $NAME export for $TARGET"
37 | python -u -m mapreduce.hekajob $BASE/chromehangs.py \
38 |   --delete-data \
39 |   --num-mappers 16 \
40 |   --input-filter $BASE/filter.json \
41 |   --data-dir $BASE/data \
42 |   --work-dir $BASE/work \
43 |   --output $RAW_DATA_FILE \
44 |   --bucket "net-mozaws-prod-us-west-2-pipeline-data"
45 | 
46 | echo "Mapreduce job exited with code: $?"
47 | 
48 | cd -
49 | echo "Looking for 'error' lines:"
50 | grep -e "^Error," $RAW_DATA_FILE
51 | echo "End of error lines."
52 | 
53 | echo "Symbolicating outputs..."
54 | time python symbolicate.py -i $RAW_DATA_FILE -o $FINAL_DATA_FILE -d $TARGET &> symbolicate.out
55 | SYMBOLICATE_CODE=$?
56 | 
57 | if [ $SYMBOLICATE_CODE -eq 0 ]; then
58 |     echo "Symbolication succeeded (exited with code $SYMBOLICATE_CODE)"
59 | else
60 |     echo "Symbolication failed (exited with code $SYMBOLICATE_CODE). Log:"
61 |     cat symbolicate.out
62 | fi
63 | 
64 | echo "Extracting common stacks..."
65 | time python extract_common_stacks.py -i $FINAL_DATA_FILE -o $COMBINED_DATA_FILE
66 | 
67 | echo "Compressing combined stacks..."
68 | gzip $COMBINED_DATA_FILE
69 | 
70 | echo "Processing weekly data"
71 | cd $BASE
72 | bash combine_week.sh "$TARGET" "$NAME" "$OUTPUT"
73 | echo "Done!"
74 | 


--------------------------------------------------------------------------------
/mapreduce/examples/heka/distribution.py:
--------------------------------------------------------------------------------
 1 | # Same as the osdistribution.py example in jydoop
 2 | import json
 3 | 
 4 | def map(k, v, cx):
 5 |     os = v['environment']['system']['os']['name']
 6 |     cx.write(os, 1)
 7 | 
 8 | def reduce(k, v, cx):
 9 |     cx.write(k, sum(v))
10 | 


--------------------------------------------------------------------------------
/mapreduce/examples/heka/filter.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 1,
 3 |   "dimensions": [
 4 |     {
 5 |       "field_name": "prefix",
 6 |       "allowed_values": "telemetry-2"
 7 |     },
 8 |     {
 9 |       "field_name": "submissionDate",
10 |       "allowed_values": "20150824"
11 |     },
12 |     {
13 |       "field_name": "sourceName",
14 |       "allowed_values": "telemetry"
15 |     },
16 |     {
17 |       "field_name": "sourceVersion",
18 |       "allowed_values": "4"
19 |     },
20 |     {
21 |       "field_name": "docType",
22 |       "allowed_values": "saved_session"
23 |     },
24 |     {
25 |       "field_name": "appName",
26 |       "allowed_values": "Firefox"
27 |     },
28 |     {
29 |       "field_name": "appUpdateChannel",
30 |       "allowed_values": "nightly"
31 |     },
32 |     {
33 |       "field_name": "appVersion",
34 |       "allowed_values": "43.0a1"
35 |     },
36 |     {
37 |       "field_name": "appBuildId",
38 |       "allowed_values": "20150810122907"
39 |     }
40 |   ]
41 | }
42 | 


--------------------------------------------------------------------------------
/mapreduce/examples/heka/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | cd "$(dirname "$0")"
 4 | DIR=$PWD
 5 | cd ../../../
 6 | mkdir -p /tmp/telemetry/work/cache
 7 | 
 8 | python -m mapreduce.hekajob $DIR/distribution.py \
 9 |        --delete-data \
10 |        --input-filter $DIR/filter.json \
11 |        --num-mappers 16 \
12 |        --num-reducers 4 \
13 |        --data-dir /tmp/telemetry/work \
14 |        --work-dir /tmp/telemetry/work \
15 |        --output /tmp/telemetry/my_mapreduce_results.out \
16 |        --bucket "net-mozaws-prod-us-west-2-pipeline-data"
17 | 


--------------------------------------------------------------------------------
/mapreduce/examples/v2/dims_only.py:
--------------------------------------------------------------------------------
1 | def map(key, dims, value, context):
2 |     submission_day = dims[-1]
3 |     context.write(submission_day, 1)
4 | 
5 | def reduce(key, values, context):
6 |     context.write(key, sum(values))
7 | 


--------------------------------------------------------------------------------
/mapreduce/examples/v2/distribution.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Get the distribution of one or more boolean/enumerated measurements.
 3 | """
 4 | 
 5 | import json
 6 | 
 7 | keys = [
 8 |     ("NEWTAB_PAGE_SHOWN", 2), # boolean
 9 |     ("NEWTAB_PAGE_SITE_CLICKED", 10), # 9-bucket
10 | ]
11 | 
12 | extra_histogram_entries = 6 # bucketN, sum, log_sum, log_sum_squares, sum_squares_lo, sum_squares_hi
13 | 
14 | def map(k, d, v, cx):
15 |     j = json.loads(v)
16 |     histograms = j.get("histograms", {})
17 | 
18 |     counts = ()
19 |     for key, buckets in keys:
20 |         if key in histograms:
21 |             val = histograms[key]
22 |             if len(val) != buckets + extra_histogram_entries:
23 |                 raise ValueError("Unexpected length for key %s: %s" % (key, val))
24 |             counts += tuple(val[0:buckets])
25 |         else:
26 |             counts += (0,) * buckets
27 |     cx.write(counts, 1)
28 | 
29 | def reduce(k, v, cx):
30 |     cx.writecsv(list(k) + [sum(v)])
31 | 


--------------------------------------------------------------------------------
/mapreduce/examples/v2/filter-nightly-buildid.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 1,
 3 |   "dimensions": [
 4 |     {
 5 |       "field_name": "reason",
 6 |       "allowed_values": ["saved-session", "idle-daily"]
 7 |     },
 8 |     {
 9 |       "field_name": "appName",
10 |       "allowed_values": ["Firefox"]
11 |     },
12 |     {
13 |       "field_name": "appUpdateChannel",
14 |       "allowed_values": ["nightly"]
15 |     },
16 |     {
17 |       "field_name": "appVersion",
18 |       "allowed_values": "*"
19 |     },
20 |     {
21 |       "field_name": "appBuildID",
22 |       "allowed_values": {
23 |         "min": "20140228"
24 |       }
25 |     },
26 |     {
27 |       "field_name": "submission_date",
28 |       "allowed_values": {
29 |         "min": "20140308",
30 |         "max": "20140309"
31 |       }
32 |     }
33 |   ]
34 | }
35 | 


--------------------------------------------------------------------------------
/mapreduce/examples/v2/filter_include_all.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 1,
 3 |   "dimensions": [
 4 |     {
 5 |       "field_name": "reason",
 6 |       "allowed_values": "*"
 7 |     },
 8 |     {
 9 |       "field_name": "appName",
10 |       "allowed_values": "*"
11 |     },
12 |     {
13 |       "field_name": "appUpdateChannel",
14 |       "allowed_values": "*"
15 |     },
16 |     {
17 |       "field_name": "appVersion",
18 |       "allowed_values": "*"
19 |     },
20 |     {
21 |       "field_name": "appBuildID",
22 |      "allowed_values": "*"
23 |     },
24 |     {
25 |       "field_name": "submission_date",
26 |       "allowed_values": "*"
27 |     }
28 |   ]
29 | }
30 | 


--------------------------------------------------------------------------------
/mapreduce/examples/v2/filter_max_buildid.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 1,
 3 |   "dimensions": [
 4 |     {
 5 |       "field_name": "reason",
 6 |       "allowed_values": ["saved-session"]
 7 |     },
 8 |     {
 9 |       "field_name": "appName",
10 |       "allowed_values": ["Firefox","Fennec"]
11 |     },
12 |     {
13 |       "field_name": "appUpdateChannel",
14 |       "allowed_values": ["nightly", "aurora"]
15 |     },
16 |     {
17 |       "field_name": "appVersion",
18 |       "allowed_values": "*"
19 |     },
20 |     {
21 |       "field_name": "appBuildID",
22 |       "allowed_values": { "max": "20130607" }
23 |     },
24 |     {
25 |       "field_name": "submission_date",
26 |       "allowed_values": "*"
27 |     }
28 |   ]
29 | }
30 | 


--------------------------------------------------------------------------------
/mapreduce/examples/v2/filter_min_buildid.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 1,
 3 |   "dimensions": [
 4 |     {
 5 |       "field_name": "reason",
 6 |       "allowed_values": ["saved-session"]
 7 |     },
 8 |     {
 9 |       "field_name": "appName",
10 |       "allowed_values": ["Firefox","Fennec"]
11 |     },
12 |     {
13 |       "field_name": "appUpdateChannel",
14 |       "allowed_values": ["nightly", "aurora"]
15 |     },
16 |     {
17 |       "field_name": "appVersion",
18 |       "allowed_values": "*"
19 |     },
20 |     {
21 |       "field_name": "appBuildID",
22 |       "allowed_values": { "min": "20130600" }
23 |     },
24 |     {
25 |       "field_name": "submission_date",
26 |       "allowed_values": "*"
27 |     }
28 |   ]
29 | }
30 | 


--------------------------------------------------------------------------------
/mapreduce/examples/v2/filter_min_max_buildid.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 1,
 3 |   "dimensions": [
 4 |     {
 5 |       "field_name": "reason",
 6 |       "allowed_values": ["saved-session"]
 7 |     },
 8 |     {
 9 |       "field_name": "appName",
10 |       "allowed_values": ["Firefox","Fennec"]
11 |     },
12 |     {
13 |       "field_name": "appUpdateChannel",
14 |       "allowed_values": ["nightly", "aurora"]
15 |     },
16 |     {
17 |       "field_name": "appVersion",
18 |       "allowed_values": "*"
19 |     },
20 |     {
21 |       "field_name": "appBuildID",
22 |       "allowed_values": { "min": "20130600", "max": "20130607" }
23 |     },
24 |     {
25 |       "field_name": "submission_date",
26 |       "allowed_values": "*"
27 |     }
28 |   ]
29 | }
30 | 


--------------------------------------------------------------------------------
/mapreduce/examples/v2/filter_saved_session_Fx_prerelease.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 1,
 3 |   "dimensions": [
 4 |     {
 5 |       "field_name": "reason",
 6 |       "allowed_values": ["saved-session"]
 7 |     },
 8 |     {
 9 |       "field_name": "appName",
10 |       "allowed_values": ["Firefox","Fennec"]
11 |     },
12 |     {
13 |       "field_name": "appUpdateChannel",
14 |       "allowed_values": ["nightly", "aurora"]
15 |     },
16 |     {
17 |       "field_name": "appVersion",
18 |       "allowed_values": "*"
19 |     },
20 |     {
21 |       "field_name": "appBuildID",
22 |       "allowed_values": "*"
23 |     },
24 |     {
25 |       "field_name": "submission_date",
26 |       "allowed_values": {
27 |         "min": "20131101",
28 |         "max": "20131103"
29 |       }
30 |     }
31 |   ]
32 | }
33 | 


--------------------------------------------------------------------------------
/mapreduce/examples/v2/osdistribution.py:
--------------------------------------------------------------------------------
 1 | # Same as the osdistribution.py example in jydoop
 2 | import json
 3 | 
 4 | def map(k, d, v, cx):
 5 |     j = json.loads(v)
 6 |     os = j['info']['OS']
 7 |     cx.write(os, 1)
 8 | 
 9 | def reduce(k, v, cx):
10 |     cx.write(k, sum(v))
11 | 


--------------------------------------------------------------------------------
/mapreduce/examples/v2/simple_counter.py:
--------------------------------------------------------------------------------
 1 | # A very simple MR job to simply count the number of 
 2 | # occurrences of each key. Useful for investigating
 3 | # the number of duplicate submissions.
 4 | 
 5 | def map(k, d, v, cx):
 6 |     cx.write(k, 1)
 7 | 
 8 | def reduce(k, v, cx):
 9 |     cx.write(k, sum(v))
10 | 


--------------------------------------------------------------------------------
/mapreduce/examples/v2/trivial.py:
--------------------------------------------------------------------------------
1 | def map(key, dims, value, context):
2 |     context.write(key[0:3], 1)
3 | 
4 | def reduce(key, values, context):
5 |     context.write(key, sum([int(v) for v in values]))
6 | 


--------------------------------------------------------------------------------
/mapreduce/experiments/experiments.py:
--------------------------------------------------------------------------------
 1 | # Experiments export
 2 | import simplejson as json
 3 | import traceback
 4 | import sys
 5 | import urllib
 6 | 
 7 | def map(k, d, v, cx):
 8 |     [reason, appName, appUpdateChannel, appVersion, appBuildID, submission_date] = d
 9 |     if appName != "Firefox":
10 |         print >>sys.stderr, "Got non-Firefox appName", appName
11 |         return
12 | 
13 |     cx.write(("Totals", appUpdateChannel, appVersion), 1)
14 |     process = False
15 |     if v.find("EXPERIMENT") != -1:
16 |         process = True
17 |     elif v.find("activeExperiment") != -1:
18 |         process = True
19 | 
20 |     if not process:
21 |         return
22 | 
23 |     try:
24 |         j = json.loads(v)
25 |         for item in j.get("log", []):
26 |             entrytype = item[0]
27 |             if entrytype == "EXPERIMENT_ACTIVATION":
28 |                 cx.write(("EXPERIMENT_ACTIVATION",
29 |                           appUpdateChannel,
30 |                           appVersion) + tuple(item[2:]), 1)
31 |             elif entrytype == "EXPERIMENT_TERMINATION":
32 |                 cx.write(("EXPERIMENT_TERMINATION",
33 |                           appUpdateChannel,
34 |                           appVersion) + tuple(item[2:]), 1)
35 | 
36 |         info = j.get("info", {})
37 |         active = info.get("activeExperiment", None)
38 |         if active is not None:
39 |             activeBranch = info.get("activeExperimentBranch", None)
40 |             cx.write(("ACTIVE", appUpdateChannel, appVersion, active, activeBranch), 1)
41 | 
42 |     except Exception as e:
43 |         print >>sys.stderr, "Error during map: ", e
44 |         cx.write(("Error",), "%s: %s\n%s" % (e, d, traceback.format_exc()))
45 | 
46 | def reduce(k, v, cx):
47 |     if k[0] == "Error":
48 |         cx.writecsv(("Error", v))
49 |     else:
50 |         cx.writecsv(list(k) + [sum(v)])
51 | 


--------------------------------------------------------------------------------
/mapreduce/experiments/filter_template.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 1,
 3 |   "dimensions": [
 4 |     {
 5 |       "field_name": "reason",
 6 |       "allowed_values": ["saved-session"]
 7 |     },
 8 |     {
 9 |       "field_name": "appName",
10 |       "allowed_values": "Firefox"
11 |     },
12 |     {
13 |       "field_name": "appUpdateChannel",
14 |       "allowed_values": "*"
15 |     },
16 |     {
17 |       "field_name": "appVersion",
18 |       "allowed_values": "*"
19 |     },
20 |     {
21 |       "field_name": "appBuildID",
22 |       "allowed_values": "*"
23 |     },
24 |     {
25 |       "field_name": "submission_date",
26 |       "allowed_values": ["__TARGET_DATE__"]
27 |     }
28 |   ]
29 | }
30 | 


--------------------------------------------------------------------------------
/mapreduce/experiments/postprocess.py:
--------------------------------------------------------------------------------
 1 | import sys, os, csv
 2 | from collections import defaultdict
 3 | import gzip
 4 | import simplejson as json
 5 | 
 6 | infile, outpattern = sys.argv[1:]
 7 | 
 8 | class Experiment(object):
 9 |     def __init__(self):
10 |         self.activeBranches = defaultdict(lambda: 0)
11 |         self.activations = defaultdict(lambda: 0)
12 |         self.terminations = defaultdict(lambda: 0)
13 | 
14 | class Channel(object):
15 |     def __init__(self):
16 |         self.total = 0
17 |         self.experiments = defaultdict(Experiment)
18 | 
19 |     def addTotal(self, c):
20 |         self.total += c
21 | 
22 |     def addActive(self, id, branch, c):
23 |         self.experiments[id].activeBranches[branch] += c
24 | 
25 |     def addActivation(self, id, data, c):
26 |         self.experiments[id].activations[tuple(data)] += c
27 | 
28 |     def addTermination(self, id, data, c):
29 |         self.experiments[id].terminations[tuple(data)] += c
30 | 
31 | channels = defaultdict(lambda: Channel())
32 | 
33 | errors = []
34 | 
35 | lines = csv.reader(open(infile))
36 | for line in lines:
37 |     entrytype = line[0]
38 |     if entrytype == "Error":
39 |         errors.append(line[1])
40 |         continue
41 | 
42 |     if entrytype == "Totals":
43 |         channel, version, count = line[1:]
44 |         count = int(count)
45 |         channels[channel].addTotal(count)
46 |     elif entrytype == "EXPERIMENT_ACTIVATION":
47 |         channel, version, reason, id = line[1:5]
48 |         data = line[5:-1]
49 |         count = int(line[-1])
50 |         channels[channel].addActivation(id, [reason] + data, count)
51 |     elif entrytype == "EXPERIMENT_TERMINATION":
52 |         channel, version, reason, id = line[1:5]
53 |         data = line[5:-1]
54 |         count = int(line[-1])
55 |         channels[channel].addTermination(id, [reason] + data, count)
56 |     elif entrytype == "ACTIVE":
57 |         channel, version, id, branch, count = line[1:]
58 |         count = int(count)
59 |         channels[channel].addActive(id, branch, count)
60 |     else:
61 |         raise ValueError("Unexpected data key, line %i: %s" % (lines.line_num, entrytype))
62 | 
63 | if len(errors):
64 |     errorfd = gzip.open("%s-errors.txt.gz" % (outpattern,), "wb")
65 |     for e in errors:
66 |         print >>errorfd, e
67 |     errorfd.close()
68 | 
69 | channels = channels.items()
70 | channels.sort(key=lambda i: i[1].total, reverse=True)
71 | 
72 | for cname, channel in channels:
73 |     d = {
74 |         "total": channel.total,
75 |         "experiments": {},
76 |     }
77 |     for id, experiment in channel.experiments.items():
78 |         d["experiments"][id] = {
79 |             "active": experiment.activeBranches,
80 |             "activations": experiment.activations.items(),
81 |             "terminations": experiment.terminations.items(),
82 |         }
83 |     fd = gzip.open("%s-%s.json.gz" % (outpattern, cname), "wb")
84 |     json.dump(d, fd)
85 |     fd.close()
86 | 


--------------------------------------------------------------------------------
/mapreduce/experiments/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | OUTPUT=output
 4 | TODAY=$(date +%Y%m%d)
 5 | if [ ! -d "$OUTPUT" ]; then
 6 |     mkdir -p "$OUTPUT"
 7 | fi
 8 | 
 9 | if [ ! -d "job" ]; then
10 |     mkdir -p "job"
11 | fi
12 | if [ ! -d "work" ]; then
13 |     mkdir -p "work"
14 | fi
15 | if [ ! -d "data" ]; then
16 |     mkdir -p "data"
17 | fi
18 | 
19 | # If we have an argument, process that day.
20 | TARGET=$1
21 | if [ -z "$TARGET" ]; then
22 |   # Default to processing "yesterday"
23 |   TARGET=$(date -d 'yesterday' +%Y%m%d)
24 | fi
25 | 
26 | echo "Today is $TODAY, and we're gathering experiment data for $TARGET"
27 | sed -r "s/__TARGET_DATE__/$TARGET/" filter_template.json > filter.json
28 | 
29 | BASE=$(pwd)
30 | FINAL_DATA_FILE=$BASE/$OUTPUT/experiments$TARGET
31 | RAW_DATA_FILE=$BASE/data.csv
32 | cd ~/telemetry-server
33 | echo "Starting the experiment export for $TARGET"
34 | python -u -m mapreduce.job $BASE/experiments.py \
35 |   --num-mappers 16 \
36 |   --num-reducers 4 \
37 |   --input-filter $BASE/filter.json \
38 |   --data-dir $BASE/data \
39 |   --work-dir $BASE/work \
40 |   --output $RAW_DATA_FILE \
41 |   --bucket telemetry-published-v2
42 | 
43 | echo "Mapreduce job exited with code: $?"
44 | 
45 | cd -
46 | 
47 | grep -e "^Error," $RAW_DATA_FILE
48 | echo "End of error lines."
49 | 
50 | echo "Adding header line and removing error lines..."
51 | python postprocess.py $RAW_DATA_FILE $FINAL_DATA_FILE
52 | echo "Removing temp file"
53 | rm $RAW_DATA_FILE
54 | echo "Listing:"
55 | ls -l $BASE/$OUTPUT/
56 | echo "Done!"
57 | 


--------------------------------------------------------------------------------
/mapreduce/flash/csv_header.txt:
--------------------------------------------------------------------------------
1 | appName,appVersion,appUpdateChannel,os,osVersion,flashVersion,count
2 | 


--------------------------------------------------------------------------------
/mapreduce/flash/filter_template.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 1,
 3 |   "dimensions": [
 4 |     {
 5 |       "field_name": "reason",
 6 |       "allowed_values": ["saved-session"]
 7 |     },
 8 |     {
 9 |       "field_name": "appName",
10 |       "allowed_values": ["Firefox","Fennec","OTHER"]
11 |     },
12 |     {
13 |       "field_name": "appUpdateChannel",
14 |       "allowed_values": "*"
15 |     },
16 |     {
17 |       "field_name": "appVersion",
18 |       "allowed_values": "*"
19 |     },
20 |     {
21 |       "field_name": "appBuildID",
22 |       "allowed_values": "*"
23 |     },
24 |     {
25 |       "field_name": "submission_date",
26 |       "allowed_values": ["__TARGET_DATE__"]
27 |     }
28 |   ]
29 | }
30 | 


--------------------------------------------------------------------------------
/mapreduce/flash/flash_versions.py:
--------------------------------------------------------------------------------
 1 | # Flash Versions export, ported from:
 2 | #   https://github.com/mozilla-metrics/telemetry-toolbox
 3 | import simplejson as json
 4 | import traceback
 5 | 
 6 | def map(k, d, v, cx):
 7 |     try:
 8 |         j = json.loads(v)
 9 |         info = j.get("info", {})
10 |         if "OS" not in info:
11 |             return
12 |         if "appName" not in info:
13 |             return
14 | 
15 |         os = info["OS"]
16 |         appName = info["appName"]
17 | 
18 |         # Keep [Metro]Firefox documents on windows only
19 |         if appName == "Firefox" or appName == "MetroFirefox":
20 |             if os != "WINNT":
21 |                 return
22 |         # Also keep all Fennec documents.
23 |         elif appName != "Fennec":
24 |             return
25 | 
26 |         out_dims = [appName]
27 |         for f in ["appVersion", "appUpdateChannel"]:
28 |             out_dims.append(info.get(f, "NA"))
29 |         out_dims.append(os)
30 |         for f in ["version", "flashVersion"]:
31 |             out_dims.append(info.get(f, "NA"))
32 | 
33 |         cx.write(",".join([str(i) for i in out_dims]), 1)
34 |     except Exception as e:
35 |         cx.write(",".join(["Error", str(e), traceback.format_exc()] + d), 1)
36 | 
37 | def setup_reduce(cx):
38 |     cx.field_separator = ","
39 | 
40 | def reduce(k, v, cx):
41 |     cx.write(k, sum(v))
42 | 
43 | combine = reduce
44 | 


--------------------------------------------------------------------------------
/mapreduce/flash/package.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | VERSION=0.1
 3 | NAME=flash_versions
 4 | TARBALL=${NAME}-$VERSION.tar.gz
 5 | 
 6 | if [ -f "$TARBALL" ]; then
 7 |     rm -v "$TARBALL"
 8 | fi
 9 | tar czvf "$TARBALL" \
10 |         csv_header.txt \
11 |         filter_template.json \
12 |         flash_versions.py \
13 |         run.sh
14 | 
15 | S3PATH=s3://telemetry-analysis-code/$NAME/$TARBALL
16 | 
17 | echo "Packaged $NAME code as $TARBALL"
18 | if [ ! -z "$(which aws)" ]; then
19 |     aws s3 cp $TARBALL $S3PATH
20 |     echo "Code successfully uploaded to S3"
21 | else
22 |     echo "AWS CLI not found - you should manually upload to $S3PATH"
23 | fi
24 | 


--------------------------------------------------------------------------------
/mapreduce/flash/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | OUTPUT=output
 4 | TODAY=$(date +%Y%m%d)
 5 | if [ ! -d "$OUTPUT" ]; then
 6 |     mkdir -p "$OUTPUT"
 7 | fi
 8 | 
 9 | if [ ! -d "job" ]; then
10 |     mkdir -p "job"
11 | fi
12 | if [ ! -d "work" ]; then
13 |     mkdir -p "work"
14 | fi
15 | if [ ! -d "data" ]; then
16 |     mkdir -p "data"
17 | fi
18 | 
19 | # If we have an argument, process that day.
20 | TARGET=$1
21 | if [ -z "$TARGET" ]; then
22 |   # Default to processing "yesterday"
23 |   TARGET=$(date -d 'yesterday' +%Y%m%d)
24 | fi
25 | 
26 | echo "Today is $TODAY, and we're gathering flash versions for $TARGET"
27 | sed -r "s/__TARGET_DATE__/$TARGET/" filter_template.json > filter_flash.json
28 | 
29 | BASE=$(pwd)
30 | cd ~/telemetry-server
31 | echo "Starting the flash versions export for $TARGET"
32 | python -u -m mapreduce.job $BASE/flash_versions.py \
33 |   --num-mappers 16 \
34 |   --input-filter $BASE/filter_flash.json \
35 |   --data-dir $BASE/data \
36 |   --work-dir $BASE/work \
37 |   --output $BASE/$OUTPUT/flash_versions$TARGET.csv.tmp \
38 |   --bucket telemetry-published-v2
39 | 
40 | echo "Mapreduce job exited with code: $?"
41 | 
42 | cd -
43 | echo "Looking for 'error' lines:"
44 | grep -e "^Error," $OUTPUT/flash_versions$TARGET.csv.tmp
45 | echo "End of error lines."
46 | 
47 | echo "Adding header line and removing error lines..."
48 | cp csv_header.txt $OUTPUT/flash_versions$TARGET.csv
49 | grep -ve "^Error," $OUTPUT/flash_versions$TARGET.csv.tmp >> $OUTPUT/flash_versions$TARGET.csv
50 | echo "Removing temp file"
51 | rm $OUTPUT/flash_versions$TARGET.csv.tmp
52 | echo "Compressing output"
53 | gzip $OUTPUT/flash_versions$TARGET.csv
54 | echo "Done!"
55 | 


--------------------------------------------------------------------------------
/mapreduce/fxosping/csv_header.txt:
--------------------------------------------------------------------------------
1 | submission_date,os,software,time_to_ping,screen_width,screen_height,pixel_ratio,locale,hardware,model,firmware_revision,update_channel,icc_mnc,icc_mcc,icc_spn,network_mnc,network_mcc,network_operator,geo_country
2 | 


--------------------------------------------------------------------------------
/mapreduce/fxosping/filter_template.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 1,
 3 |   "dimensions": [
 4 |     {
 5 |       "field_name": "reason",
 6 |       "allowed_values": ["ftu"]
 7 |     },
 8 |     {
 9 |       "field_name": "appName",
10 |       "allowed_values": ["FirefoxOS"]
11 |     },
12 |     {
13 |       "field_name": "appUpdateChannel",
14 |       "allowed_values": "*"
15 |     },
16 |     {
17 |       "field_name": "appVersion",
18 |       "allowed_values": "*"
19 |     },
20 |     {
21 |       "field_name": "appBuildID",
22 |      "allowed_values": "*"
23 |     },
24 |     {
25 |       "field_name": "submission_date",
26 |       "allowed_values": __TARGET_DATE__
27 |     }
28 |   ]
29 | }
30 | 


--------------------------------------------------------------------------------
/mapreduce/fxosping/fxosping.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | def map(key, dims, value, context):
 4 |     data = json.loads(value)
 5 |     reason, appName, appUpdateChannel, appVersion, appBuildID, submission_date = dims
 6 | 
 7 |     def dataval(key):
 8 |         return data.get(key, 'unknown')
 9 | 
10 |     def strval(d, key):
11 |         if not d:
12 |             return 'unknown'
13 |         return d.get(key, 'unknown') or 'unknown'
14 | 
15 |     hours = -1
16 |     time_to_ping = 'unknown'
17 |     if 'pingTime' in data and 'activationTime' in data:
18 |         # Time to ping in hours
19 |         hours = float(int(data['pingTime']) - int(data['activationTime'])) / (60 * 60 * 1000)
20 |         time_to_ping = '%d' % round(hours)
21 | 
22 |     context.write(key, submission_date)
23 |     context.write(key, strval(data, 'deviceinfo.os'))
24 |     context.write(key, strval(data, 'deviceinfo.software'))
25 |     context.write(key, time_to_ping)
26 |     context.write(key, dataval('screenWidth'))
27 |     context.write(key, dataval('screenHeight'))
28 |     context.write(key, dataval('devicePixelRatio'))
29 |     context.write(key, strval(data, 'locale'))
30 |     context.write(key, strval(data, 'deviceinfo.hardware'))
31 |     context.write(key, strval(data, 'deviceinfo.product_model'))
32 |     context.write(key, strval(data, 'deviceinfo.firmware_revision'))
33 |     context.write(key, appUpdateChannel)
34 | 
35 |     icc = data.get('icc')
36 |     context.write(key, strval(icc, 'mnc'))
37 |     context.write(key, strval(icc, 'mcc'))
38 |     context.write(key, strval(icc, 'spn'))
39 | 
40 |     network = data.get('network')
41 |     context.write(key, strval(network, 'mnc'))
42 |     context.write(key, strval(network, 'mcc'))
43 |     context.write(key, strval(network, 'operator'))
44 | 
45 |     info = data.get('info')
46 |     context.write(key, strval(info, 'geoCountry'))
47 | 
48 | def setup_reduce(context):
49 |     context.field_separator = ','
50 | 
51 | def reduce(key, values, context):
52 |     context.writecsv(values)
53 | 


--------------------------------------------------------------------------------
/mapreduce/fxosping/package.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | VERSION=0.1
 3 | NAME=fxosping
 4 | TARBALL=${NAME}-$VERSION.tar.gz
 5 | BASE=$(pwd)
 6 | THIS_DIR=$(cd "`dirname "$0"`"; pwd)
 7 | 
 8 | if [ -f "$TARBALL" ]; then
 9 |     rm -v "$TARBALL"
10 | fi
11 | 
12 | cd "$THIS_DIR"
13 | tar czvf "$BASE/$TARBALL" \
14 |         fxosping.py \
15 |         filter_template.json \
16 |         run.sh
17 | 
18 | S3PATH=s3://telemetry-analysis-code/$NAME/$TARBALL
19 | 
20 | echo "Packaged $NAME code as $TARBALL"
21 | if [ ! -z "$(which aws)" ]; then
22 |     aws s3 cp $TARBALL $S3PATH
23 |     echo "Code successfully uploaded to S3"
24 | else
25 |     echo "AWS CLI not found - you should manually upload to $S3PATH"
26 | fi
27 | 


--------------------------------------------------------------------------------
/mapreduce/fxosping/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | BASE=$(pwd)
 4 | THIS_DIR=$(cd "`dirname "$0"`"; pwd)
 5 | TELEMETRY_SERVER_DIR=$(cd "$THIS_DIR/../.."; pwd)
 6 | if [ ! -d "$TELEMETRY_SERVER_DIR/mapreduce" ]; then
 7 |     TELEMETRY_SERVER_DIR=$HOME/telemetry-server
 8 | fi
 9 | 
10 | OUTPUT=${OUTPUT:-output}
11 | TODAY=$(date +%Y%m%d)
12 | 
13 | if [ ! -d "$OUTPUT" ]; then
14 |     mkdir -p "$OUTPUT"
15 | fi
16 | 
17 | if [ ! -d "job" ]; then
18 |     mkdir -p "job"
19 | fi
20 | 
21 | if [ ! -d "work" ]; then
22 |     mkdir -p "work"
23 | fi
24 | 
25 | if [ ! -d "data" ]; then
26 |     mkdir -p "data"
27 | fi
28 | 
29 | # If we have an argument, process that day.
30 | TARGET=$1
31 | if [ -z "$TARGET" ]; then
32 |     # Default to processing "yesterday"
33 |     TARGET=$(date -d 'yesterday' +%Y%m%d)
34 | fi
35 | 
36 | if [ "$TARGET" = "all" ]; then
37 |     TARGET_DATE="\"*\""
38 | else
39 |     TARGET_DATE="[\"$TARGET\"]"
40 | fi
41 | 
42 | echo "Today is $TODAY, and we're gathering fxosping data for '$TARGET'"
43 | 
44 | sed -r "s/__TARGET_DATE__/$TARGET_DATE/" \
45 |        "$THIS_DIR/filter_template.json" > "$THIS_DIR/filter.json"
46 | 
47 | cd "$TELEMETRY_SERVER_DIR"
48 | 
49 | OUTPUT_FILE=$BASE/$OUTPUT/fxosping_$TARGET.csv
50 | TMP_OUTPUT_FILE=${OUTPUT_FILE}.tmp
51 | 
52 | echo "Starting fxosping export for $TARGET"
53 | python -m mapreduce.job "$THIS_DIR/fxosping.py" \
54 |    --input-filter "$THIS_DIR/filter.json" \
55 |    --num-mappers 16 \
56 |    --num-reducers 4 \
57 |    --data-dir "$BASE/data" \
58 |    --work-dir "$BASE/work" \
59 |    --output "$TMP_OUTPUT_FILE" \
60 |    --bucket "telemetry-published-v2"
61 | 
62 | echo "Mapreduce job exited with code: $?"
63 | 
64 | echo "Adding header line"
65 | cp "$THIS_DIR/csv_header.txt" "$OUTPUT_FILE"
66 | cat "$TMP_OUTPUT_FILE" >> "$OUTPUT_FILE"
67 | 
68 | echo "Removing temp file"
69 | rm "$TMP_OUTPUT_FILE"
70 | 
71 | cd "$BASE"
72 | echo "Compressing output"
73 | gzip -f "$OUTPUT_FILE"
74 | 
75 | echo "Done!"
76 | 


--------------------------------------------------------------------------------
/mapreduce/loop_failure_summary/failures_by_type.py:
--------------------------------------------------------------------------------
 1 | import simplejson as json
 2 | 
 3 | def map(k, d, v, cx):
 4 |     reason, appName, appUpdateChannel, appVersion, appBuildID, submission_date = d
 5 |     try:
 6 |         j = json.loads(v)
 7 | 
 8 |         # Filter just the ice failure reports:
 9 |         if "report" in j and j["report"] == "ice failure":
10 |             cx.write(k, (submission_date, j.get("connectionstate", "UNKNOWN"), v))
11 |     except Exception as e:
12 |         cx.write("ERROR", str(e))
13 | 
14 | def reduce(k, v, cx):
15 |     if k == "ERROR":
16 |         for err in v:
17 |             cx.write(k, err)
18 |     else:
19 |         # data contains duplicates, so we just output the first record for each
20 |         # key.
21 |         submission_date, connectionstate, payload = v[0]
22 |         cx.write(submission_date, "\t".join((connectionstate, payload)))
23 | 


--------------------------------------------------------------------------------
/mapreduce/loop_failure_summary/filter_template.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 1,
 3 |   "dimensions": [
 4 |     {
 5 |       "field_name": "reason",
 6 |       "allowed_values": ["loop"]
 7 |     },
 8 |     {
 9 |       "field_name": "appName",
10 |        "allowed_values": "*"
11 |     },
12 |     {
13 |       "field_name": "appUpdateChannel",
14 |       "allowed_values": "*"
15 |     },
16 |     {
17 |       "field_name": "appVersion",
18 |       "allowed_values": "*"
19 |     },
20 |     {
21 |       "field_name": "appBuildID",
22 |       "allowed_values": "*"
23 |     },
24 |     {
25 |       "field_name": "submission_date",
26 |       "allowed_values": ["__TARGET_DATE__"]
27 |     }
28 |   ]
29 | }
30 | 
31 | 


--------------------------------------------------------------------------------
/mapreduce/loop_failure_summary/header.txt:
--------------------------------------------------------------------------------
1 | submission_date	failure_type	payload
2 | 


--------------------------------------------------------------------------------
/mapreduce/loop_failure_summary/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Loop Telemetry
 3 | 
 4 | OUTPUT=output
 5 | TODAY=$(date +%Y%m%d)
 6 | 
 7 | JOB_DIR=$(pwd)
 8 | BASE=/mnt/telemetry
 9 | cd $BASE
10 | if [ ! -d "$OUTPUT" ]; then
11 |     mkdir -p "$OUTPUT"
12 | fi
13 | if [ ! -d "work" ]; then
14 |     mkdir -p "work"
15 | fi
16 | 
17 | if [ ! -d "data" ]; then
18 |     mkdir -p "data"
19 | fi
20 | 
21 | cd $JOB_DIR
22 | 
23 | TARGET=$1
24 | if [ -z "$TARGET" ]; then
25 |   # Default to processing "yesterday"
26 |   TARGET=$(date -d 'yesterday' +%Y%m%d)
27 | fi
28 | 
29 | echo "Today is $TODAY | Gathering data for $TARGET"
30 | sed -r "s/__TARGET_DATE__/$TARGET/" filter_template.json > filter.json
31 | 
32 | FINAL_DATA_FILE=$BASE/$OUTPUT/$TARGET.tsv
33 | RAW_DATA_FILE=${FINAL_DATA_FILE}.tmp
34 | cd ~/telemetry-server
35 | echo "Starting the export for data on $TARGET"
36 | echo "running $BASE/failures_by_type.py"
37 | python -u -m mapreduce.job $JOB_DIR/failures_by_type.py \
38 |   --num-mappers 16 \
39 |   --num-reducers 1 \
40 |   --input-filter $JOB_DIR/filter.json \
41 |   --data-dir $BASE/data \
42 |   --work-dir $BASE/work \
43 |   --output $RAW_DATA_FILE \
44 |   --bucket telemetry-published-v2
45 | 
46 | cat $JOB_DIR/header.txt > $FINAL_DATA_FILE
47 | cat $RAW_DATA_FILE >> $FINAL_DATA_FILE
48 | rm $RAW_DATA_FILE
49 | 
50 | aws s3 cp s3://telemetry-private-analysis-2/loop_failures/data/failures_by_type.json $JOB_DIR/failures_by_type.json
51 | if [ -f "$JOB_DIR/failures_by_type.json" ]; then
52 |   # back up the existing one
53 |   cp $JOB_DIR/failures_by_type.json $BASE/$OUTPUT/failures_by_type.json.prev
54 | else
55 |   # create an empty one.
56 |   touch $JOB_DIR/failures_by_type.json
57 | fi
58 | python $JOB_DIR/summarize.py -i $FINAL_DATA_FILE -o $BASE/$OUTPUT/$TARGET.summary.json -c $JOB_DIR/failures_by_type.json -O $BASE/$OUTPUT/failures_by_type.json
59 | gzip $FINAL_DATA_FILE
60 | 


--------------------------------------------------------------------------------
/mapreduce/loop_failure_summary/summarize.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import fileinput
 3 | import json
 4 | import sys
 5 | 
 6 | def main():
 7 |     parser = argparse.ArgumentParser(description='Summarize daily loop failures.')
 8 |     parser.add_argument("-i", "--input-file", help="Filename to read from", required=True, type=file)
 9 |     parser.add_argument("-o", "--summary-output", help="Filename to save day's data", required=True, type=argparse.FileType('w'))
10 |     parser.add_argument("-c", "--combined-input", help="Filename to read combined daily data", type=file)
11 |     parser.add_argument("-O", "--combined-output", help="Filename to save combined daily data", required=True, type=argparse.FileType('w'))
12 |     args = parser.parse_args()
13 | 
14 |     headers = None
15 |     date_idx = -1
16 |     err_idx = -1
17 |     date_map = {}
18 |     for line in args.input_file:
19 |         fields = line.split("\t")
20 |         if headers is None:
21 |             headers = fields
22 |             try:
23 |                 date_idx = headers.index("submission_date")
24 |                 err_idx = headers.index("failure_type")
25 |             except ValueError as e:
26 |                 print "Error: required field missing. We need 'submission_date' " \
27 |                       "and 'failure_type' to generate a summary"
28 |                 return 2
29 |         else:
30 |             submission_date = fields[date_idx]
31 |             failure_type = fields[err_idx]
32 |             if submission_date not in date_map:
33 |                 date_map[submission_date] = {}
34 | 
35 |             if failure_type not in date_map[submission_date]:
36 |                 date_map[submission_date][failure_type] = 1
37 |             else:
38 |                 date_map[submission_date][failure_type] += 1
39 | 
40 |     json.dump(date_map, args.summary_output)
41 |     try:
42 |         combined = json.load(args.combined_input)
43 |     except:
44 |         combined = []
45 | 
46 |     current_index = 0
47 |     # Insert each date into the correct spot in the array.
48 |     for d in sorted(date_map.keys()):
49 |         date_map[d]["date"] = d
50 |         while current_index < len(combined) and d > combined[current_index]["date"]:
51 |             current_index += 1
52 | 
53 |         # if the date is already there, overwrite with new values
54 |         if len(combined) > current_index and combined[current_index]["date"] == d:
55 |             for k in date_map[d].keys():
56 |                 combined[current_index][k] = date_map[d][k]
57 |         else:
58 |             combined.insert(current_index, date_map[d])
59 |         # Output last 180 days
60 |         json.dump(combined[-180:], args.combined_output)
61 |     return 0
62 | 
63 | if __name__ == "__main__":
64 |     sys.exit(main())
65 | 


--------------------------------------------------------------------------------
/mapreduce/mainthreadio/csv_header.txt:
--------------------------------------------------------------------------------
1 | app_name,interval,filename,submission_count,median_time,median_count
2 | 


--------------------------------------------------------------------------------
/mapreduce/mainthreadio/filter_template.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 1,
 3 |   "dimensions": [
 4 |     {
 5 |       "field_name": "reason",
 6 |       "allowed_values": ["saved-session"]
 7 |     },
 8 |     {
 9 |       "field_name": "appName",
10 |       "allowed_values": "*"
11 |     },
12 |     {
13 |       "field_name": "appUpdateChannel",
14 |       "allowed_values": "nightly"
15 |     },
16 |     {
17 |       "field_name": "appVersion",
18 |       "allowed_values": "*"
19 |     },
20 |     {
21 |       "field_name": "appBuildID",
22 |       "allowed_values": {"min": "__BID_BEGIN__", "max": "__BID_END__999999"}
23 |     },
24 |     {
25 |       "field_name": "submission_date",
26 |       "allowed_values": {"min": "__BEGIN__", "max": "__END__999999"}
27 |     }
28 |   ]
29 | }
30 | 


--------------------------------------------------------------------------------
/mapreduce/mainthreadio/mainthreadio.py:
--------------------------------------------------------------------------------
 1 | import simplejson as json
 2 | import numpy
 3 | import io
 4 | import csv
 5 | from string import maketrans
 6 | 
 7 | def clean(s):
 8 |     return normalize(s).translate(None, ",")
 9 | 
10 | def normalize(s):
11 |     if type(s) == unicode:
12 |         return s.encode('utf8', 'ignore')
13 |     else:
14 |         return str(s)
15 | 
16 | def safe_key(pieces):
17 |     output = io.BytesIO()
18 |     writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL)
19 |     writer.writerow(pieces)
20 |     return output.getvalue().strip()
21 | 
22 | def map(k, d, v, cx):
23 |     global n_pings
24 | 
25 |     if "fileIOReports" not in v or '"fileIOReports":null' in v:
26 |         return
27 | 
28 |     parsed = json.loads(v)
29 |     reason, appName, appUpdateChannel, appVersion, appBuildID, submission_date = d
30 | 
31 |     if not "fileIOReports" in parsed:
32 |         return
33 | 
34 |     if not parsed["fileIOReports"]:
35 |         return
36 | 
37 |     startup_sub = False
38 |     execution_sub = False
39 |     shutdown_sub = False
40 | 
41 |     for f, arr in parsed["fileIOReports"].iteritems():
42 |         if len(arr) != 3: # Don't support the old format
43 |             continue
44 | 
45 |         if arr[0] is not None:
46 |             cx.write(safe_key([appName, "startup", clean(f)]), [arr[0][0], sum(arr[0][1:])])
47 |             if not startup_sub:
48 |                 cx.write(safe_key([appName, "startup", "TOTAL"]), [0, 0])
49 |                 startup_sub = True
50 | 
51 |         if arr[1] is not None:
52 |             cx.write(safe_key([appName, "execution", clean(f)]), [arr[1][0], sum(arr[1][1:])])
53 |             if not execution_sub:
54 |                 cx.write(safe_key([appName, "execution", "TOTAL"]), [0, 0])
55 |                 execution_sub = True
56 | 
57 |         if arr[2] is not None:
58 |             cx.write(safe_key([appName, "shutdown", clean(f)]), [arr[2][0], sum(arr[2][1:])])
59 |             if not shutdown_sub:
60 |                 cx.write(safe_key([appName, "shutdown", "TOTAL"]), [0, 0])
61 |                 shutdown_sub = True
62 | 
63 | def setup_reduce(cx):
64 |     cx.field_separator = ","
65 | 
66 | def reduce(k, v, cx):
67 |     totals = []
68 |     counts = []
69 | 
70 |     if len(v) > 100:
71 |         sup = min(len(v), 10000)
72 | 
73 |         for total, count in v[:sup]:
74 |             totals.append(total)
75 |             counts.append(count)
76 | 
77 |         # Output fields:
78 |         # app_name, interval, filename, submission_count, median_time, median_count
79 |         cx.write(k, ",".join([str(len(v)), str(numpy.median(totals)), str(numpy.median(counts))]))
80 | 


--------------------------------------------------------------------------------
/mapreduce/mainthreadio/package.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | VERSION=0.2
 3 | NAME=mainthreadio
 4 | TARBALL=${NAME}-$VERSION.tar.gz
 5 | 
 6 | if [ -f "$TARBALL" ]; then
 7 |     rm -v "$TARBALL"
 8 | fi
 9 | tar czvf "$TARBALL" run.sh
10 | 
11 | S3PATH=s3://telemetry-analysis-code/$NAME/$TARBALL
12 | 
13 | echo "Packaged $NAME code as $TARBALL"
14 | if [ ! -z "$(which aws)" ]; then
15 |     aws s3 cp $TARBALL $S3PATH
16 |     echo "Code successfully uploaded to S3"
17 | else
18 |     echo "AWS CLI not found - you should manually upload to $S3PATH"
19 | fi
20 | 


--------------------------------------------------------------------------------
/mapreduce/mainthreadio/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | cd $(cd -P -- "$(dirname -- "$0")" && pwd -P)
 4 | sudo apt-get --yes install python-numpy git
 5 | 
 6 | rm -rf telemetry-server
 7 | git clone https://github.com/mozilla/telemetry-server.git
 8 | cd telemetry-server/mapreduce/mainthreadio
 9 | 
10 | OUTPUT=output
11 | TODAY=$(date +%Y%m%d)
12 | if [ ! -d "$OUTPUT" ]; then
13 |     mkdir -p "$OUTPUT"
14 | fi
15 | 
16 | if [ ! -d "job" ]; then
17 |     mkdir -p "job"
18 | fi
19 | if [ ! -d "work" ]; then
20 |     mkdir -p "work"
21 | fi
22 | if [ ! -d "data" ]; then
23 |     mkdir -p "data"
24 | fi
25 | 
26 | # If we have an argument, process that week.
27 | DAYS=$1
28 | if [ -z "$DAYS" ]; then
29 |   # Default to processing "last week"
30 |   DAYS=0
31 | fi
32 | 
33 | BEGIN=$(date -d "$TODAY - $DAYS days - 1 weeks" +%Y%m%d)
34 | END=$(date -d "TODAY - $DAYS days" +%Y%m%d)
35 | BID_BEGIN=$BEGIN
36 | BID_END=$BEGIN
37 | TARGET=$BID_BEGIN
38 | 
39 | echo "Today is $TODAY, and we're gathering mainthreadio data from $BEGIN to $END for build-ids from $BID_BEGIN to $BID_END"
40 | sed -e "s/__BEGIN__/$BEGIN/" -e "s/__END__/$END/" -e "s/__BID_BEGIN__/$BID_BEGIN/" -e "s/__BID_END__/$BID_END/" filter_template.json > filter.json
41 | 
42 | BASE=$(pwd)
43 | FINAL_DATA_FILE=$BASE/$OUTPUT/buildid_$TARGET.csv
44 | RAW_DATA_FILE=${FINAL_DATA_FILE}.tmp
45 | 
46 | cd ../../
47 | echo "Starting the mainthreadio export for $TARGET"
48 | python -u -m mapreduce.job $BASE/mainthreadio.py \
49 |   --num-mappers 16 \
50 |   --num-reducers 4 \
51 |   --input-filter $BASE/filter.json \
52 |   --data-dir $BASE/data \
53 |   --work-dir $BASE/work \
54 |   --output $RAW_DATA_FILE \
55 |   --bucket telemetry-published-v2 #--data-dir $BASE/work/cache --local-only
56 | 
57 | echo "Mapreduce job exited with code: $?"
58 | 
59 | echo "Adding header line"
60 | cp $BASE/csv_header.txt $FINAL_DATA_FILE
61 | 
62 | echo "Compute summaries"
63 | python $BASE/summary.py $RAW_DATA_FILE
64 | 
65 | echo "Copying iacomus configuration"
66 | cp $BASE/iacomus.json $BASE/$OUTPUT
67 | 
68 | cat $RAW_DATA_FILE >> $FINAL_DATA_FILE
69 | echo "Removing temp file"
70 | rm $RAW_DATA_FILE
71 | echo "Compressing output"
72 | gzip $FINAL_DATA_FILE
73 | echo "Done!"
74 | 


--------------------------------------------------------------------------------
/mapreduce/mainthreadio/summary.py:
--------------------------------------------------------------------------------
 1 | import csv
 2 | import gzip
 3 | import os
 4 | import re
 5 | import sys
 6 | import numpy
 7 | 
 8 | APP_COLUMN=0
 9 | INTERVAL_COLUMN=1
10 | FILE_COLUMN=2
11 | SUBMISSION_COUNT_COLUMN=3
12 | MEDIAN_TIME_COLUMN=4
13 | MEDIAN_COUNT_COLUMN=5
14 | 
15 | input = sys.argv[1]
16 | rows = None
17 | totals = {}
18 | 
19 | def key(row):
20 |     return str(row[APP_COLUMN]) + str(row[INTERVAL_COLUMN])
21 | 
22 | def parse():
23 |     global rows
24 | 
25 |     with open(input) as f:
26 |         lines = f.readlines()
27 |         rows = map(lambda x: x.split(','), lines)
28 | 
29 |     for i, row in enumerate(rows[:]):
30 |         if row[FILE_COLUMN] == "TOTAL":
31 |             totals[key(row)] = row
32 |             rows.remove(row)
33 | 
34 | def normalize():
35 |     global rows
36 | 
37 |     for row in rows:
38 |         k = key(row)
39 |         row[SUBMISSION_COUNT_COLUMN] = float(row[SUBMISSION_COUNT_COLUMN]) / float(totals[k][SUBMISSION_COUNT_COLUMN])
40 | 
41 |     rows = sorted(rows, key=lambda x: x[SUBMISSION_COUNT_COLUMN], reverse=True)
42 |     for row in rows:
43 |         row[SUBMISSION_COUNT_COLUMN] = str(row[SUBMISSION_COUNT_COLUMN])
44 | 
45 | def dump():
46 |     with open(input, "w") as f:
47 |         for row in rows:
48 |             f.write(",".join(row))
49 | 
50 | parse()
51 | normalize()
52 | dump()
53 | 


--------------------------------------------------------------------------------
/mapreduce/slowsql/csv_header.txt:
--------------------------------------------------------------------------------
1 | thread_type,submission_date,app_name,app_version,app_update_channel,query,document_count,total_invocations,total_duration,median_duration
2 | 


--------------------------------------------------------------------------------
/mapreduce/slowsql/filter_template.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 1,
 3 |   "dimensions": [
 4 |     {
 5 |       "field_name": "prefix",
 6 |       "allowed_values": "telemetry-2"
 7 |     },
 8 |     {
 9 |       "field_name": "submissionDate",
10 |       "allowed_values": ["__TARGET_DATE__"]
11 |     },
12 |     {
13 |       "field_name": "sourceName",
14 |       "allowed_values": "telemetry"
15 |     },
16 |     {
17 |       "field_name": "sourceVersion",
18 |       "allowed_values": "4"
19 |     },
20 |     {
21 |       "field_name": "docType",
22 |       "allowed_values": "saved_session"
23 |     },
24 |     {
25 |       "field_name": "appName",
26 |       "allowed_values": "*"
27 |     },
28 |     {
29 |       "field_name": "appUpdateChannel",
30 |       "allowed_values": "*"
31 |     },
32 |     {
33 |       "field_name": "appVersion",
34 |       "allowed_values": "*"
35 |     },
36 |     {
37 |       "field_name": "appBuildId",
38 |       "allowed_values": "*"
39 |     }
40 |   ]
41 | }
42 | 


--------------------------------------------------------------------------------
/mapreduce/slowsql/package.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | VERSION=0.4
 3 | NAME=SlowSQL
 4 | TARBALL=${NAME}-$VERSION.tar.gz
 5 | if [ ! -d "combine.py" ]; then
 6 |     echo "Fetching 'combine.py' from github..."
 7 |     wget https://github.com/mreid-moz/slowsql-dashboard/raw/master/data/combine.py
 8 | else
 9 |     echo "Using existing 'combine.py'"
10 | fi
11 | 
12 | if [ -f "$TARBALL" ]; then
13 |     rm -v "$TARBALL"
14 | fi
15 | tar czvf "$TARBALL" \
16 |         combine.py \
17 |         csv_header.txt \
18 |         filter_template.json \
19 |         run.sh \
20 |         slowsql.py
21 | 
22 | echo "Packaged $NAME code as $TARBALL"
23 | 


--------------------------------------------------------------------------------
/mapreduce/slowsql/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | OUTPUT=output
 4 | NAME=SlowSQL
 5 | TODAY=$(date +%Y%m%d)
 6 | if [ ! -d "$OUTPUT" ]; then
 7 |     mkdir -p "$OUTPUT"
 8 | fi
 9 | 
10 | if [ ! -d "job" ]; then
11 |     mkdir -p "job"
12 | fi
13 | if [ ! -d "work" ]; then
14 |     mkdir -p "work"
15 | fi
16 | if [ ! -d "data" ]; then
17 |     mkdir -p "data"
18 | fi
19 | 
20 | # If we have an argument, process that day.
21 | TARGET=$1
22 | if [ -z "$TARGET" ]; then
23 |   # Default to processing "yesterday"
24 |   TARGET=$(date -d 'yesterday' +%Y%m%d)
25 | fi
26 | 
27 | echo "Today is $TODAY, and we're gathering slowsql data for $TARGET"
28 | sed -r "s/__TARGET_DATE__/$TARGET/" filter_template.json > filter.json
29 | 
30 | BASE=$(pwd)
31 | FINAL_DATA_FILE=$BASE/$OUTPUT/slowsql$TARGET.csv
32 | RAW_DATA_FILE=${FINAL_DATA_FILE}.tmp
33 | cd ~/telemetry-server
34 | echo "Starting the slowsql export for $TARGET"
35 | python -u -m mapreduce.hekajob $BASE/slowsql.py \
36 |   --delete-data \
37 |   --num-mappers 16 \
38 |   --num-reducers 4 \
39 |   --input-filter $BASE/filter.json \
40 |   --data-dir $BASE/data \
41 |   --work-dir $BASE/work \
42 |   --output $RAW_DATA_FILE \
43 |   --bucket "net-mozaws-prod-us-west-2-pipeline-data"
44 | 
45 | echo "Mapreduce job exited with code: $?"
46 | 
47 | cd -
48 | echo "Looking for 'error' lines:"
49 | grep -e "^Error," $RAW_DATA_FILE
50 | echo "End of error lines."
51 | 
52 | echo "Adding header line and removing error lines..."
53 | cp csv_header.txt $FINAL_DATA_FILE
54 | grep -ve "^Error," $RAW_DATA_FILE >> $FINAL_DATA_FILE
55 | echo "Removing temp file"
56 | rm $RAW_DATA_FILE
57 | echo "Compressing output"
58 | gzip $FINAL_DATA_FILE
59 | echo "Done!"
60 | 
61 | echo "Processing weekly data"
62 | cd $BASE
63 | if [ ! -d "weekly" ]; then
64 |     mkdir -p "weekly"
65 | fi
66 | cd weekly
67 | # Monday is day 1
68 | OFFSET=$(( $(date -d $TARGET +%u) - 1 ))
69 | MONDAY=$(date -d "$TARGET - $OFFSET days" +%Y%m%d)
70 | SUNDAY=$(date -d "$MONDAY + 6 days" +%Y%m%d)
71 | echo "For target '$TARGET', week is $MONDAY to $SUNDAY"
72 | for f in $(seq 0 6); do
73 |     DAY=$(date -d "$MONDAY + $f days" +%Y%m%d)
74 |     if [ "$DAY" -eq "$TARGET" ]; then
75 |         echo "Using local file for today ($DAY)"
76 |         cp $BASE/$OUTPUT/slowsql$DAY.csv.gz ./
77 |     else
78 |         echo "Fetching $DAY"
79 |         aws s3 cp s3://telemetry-public-analysis-2/$NAME/data/slowsql$DAY.csv.gz ./slowsql$DAY.csv.gz
80 |     fi
81 | done
82 | echo "Creating weekly data for $MONDAY to $SUNDAY"
83 | python $BASE/combine.py $BASE/$OUTPUT $MONDAY $SUNDAY
84 | echo "Created weekly output files:"
85 | ls -l $BASE/$OUTPUT/
86 | 


--------------------------------------------------------------------------------
/mongodb/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/mongodb/__init__.py


--------------------------------------------------------------------------------
/mongodb/examples/osdistribution.js:
--------------------------------------------------------------------------------
1 | printjson(db.payloads.mapReduce(
2 |   function() { emit(this.info.OS, 1);},
3 |   function(key, values) { return Array.sum(values);},
4 |   {
5 |     out: { inline: 1}
6 |   }))
7 | 


--------------------------------------------------------------------------------
/monitoring/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/monitoring/__init__.py


--------------------------------------------------------------------------------
/monitoring/anomaly_detection/notify.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | # This Source Code Form is subject to the terms of the Mozilla Public
 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 7 | 
 8 | # If stdin contains any non-whitespace data, send it as an email using SES.
 9 | 
10 | import argparse
11 | from boto.ses import connect_to_region as ses_connect
12 | import sys
13 | import traceback
14 | import simplejson as json
15 | 
16 | if __name__ == "__main__":
17 |     parser = argparse.ArgumentParser(description="Telemetry notifier")
18 |     parser.add_argument("-c", "--config", help="Configuration file", type=file)
19 |     parser.add_argument("-f", "--from-email", help="Email 'from:' address")
20 |     parser.add_argument("-t", "--to-email", help="Email 'to:' address (multiple allowed)", action="append")
21 |     parser.add_argument("-s", "--subject", help="Email Subject")
22 |     parser.add_argument("-d", "--dry-run", help="Print out what would happen instead of sending email", action="store_true")
23 |     args = parser.parse_args()
24 | 
25 |     message_body = sys.stdin.read().strip()
26 | 
27 |     if message_body == "":
28 |         # nothing to notify about.
29 |         if args.dry_run:
30 |             print "Would not have sent any mail."
31 |     else:
32 |         if args.config:
33 |             try:
34 |                 config = json.load(args.config)
35 |             except:
36 |                 traceback.print_exc()
37 |                 config = {}
38 |         else:
39 |             config = {}
40 | 
41 |         if args.from_email:
42 |             config["notify_from"] = args.from_email
43 | 
44 |         if args.to_email:
45 |             config["notify_to"] = args.to_email
46 | 
47 |         if args.subject:
48 |             config["notify_subject"] = args.subject
49 | 
50 |         if args.dry_run:
51 |             print "Here is what we would have sent:"
52 |             print "   From:", config["notify_from"]
53 |             print "     To:", config["notify_to"]
54 |             print "Subject:", config["notify_subject"]
55 |             print "   Body:", message_body
56 |         else:
57 |             ses = ses_connect('us-east-1') # only supported region!
58 |             ses.send_email(
59 |                 source          = config["notify_from"],
60 |                 subject         = config["notify_subject"],
61 |                 format          = "text",
62 |                 body            = message_body,
63 |                 to_addresses    = config["notify_to"]
64 |             )
65 | 


--------------------------------------------------------------------------------
/monitoring/expire_flash_video/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/monitoring/expire_flash_video/__init__.py


--------------------------------------------------------------------------------
/monitoring/heka/common.toml:
--------------------------------------------------------------------------------
 1 | [hekad]
 2 | maxprocs = 4
 3 | max_timer_inject = 100
 4 | max_process_duration = 1000000
 5 | 
 6 | [TCP:5565]
 7 | type = "TcpInput"
 8 | parser_type = "message.proto"
 9 | decoder = "ProtobufDecoder"
10 | address = ":5565"
11 | 	[TCP:5565.signer.telemetry_0]
12 | 	hmac_key = "TODO change on deploy" # TODO update on deploy
13 | 
14 | [TelemetrySandboxManager]
15 | type = "SandboxManagerFilter"
16 | message_signer = "telemetry"
17 | message_matcher = "Type == 'heka.control.sandbox'"
18 | max_filters = 10
19 | 
20 | [Dashboard]
21 | type = "DashboardOutput"
22 | address = ":4352"
23 | ticker_interval = 10
24 | 
25 | [TcpOutput]
26 | address = "10.250.68.186:5565"
27 | message_matcher =  "Type == 'heka.sandbox-output' && Fields[payload_type] == 'cbufd'"
28 | 


--------------------------------------------------------------------------------
/monitoring/heka/incoming_stats.toml:
--------------------------------------------------------------------------------
 1 | [TelemetryIncomingStatsInput]
 2 | type = "LogstreamerInput"
 3 | log_directory = "/mnt/telemetry/log"
 4 | file_match = 'telemetry-incoming-stats\.log'
 5 | decoder = "TelemetryIncomingStatsDecoder"
 6 | 
 7 | [TelemetryIncomingStatsDecoder]
 8 | type = "SandboxDecoder"
 9 | script_type = "lua"
10 | filename = "lua_decoders/telemetry_incoming_stats.lua"
11 | 
12 | [TelemetryStatsRecords]
13 | type = "SandboxFilter"
14 | message_matcher = "Type == 'telemetry.incoming_stats'"
15 | ticker_interval = 10
16 | script_type = "lua"
17 | filename = "lua_filters/telemetry_stats_records.lua"
18 | preserve_data = true
19 | 
20 | [TelemetryStatsBytes]
21 | type = "SandboxFilter"
22 | message_matcher = "Type == 'telemetry.incoming_stats' && Fields[channel] == 'ALL'"
23 | ticker_interval = 10
24 | script_type = "lua"
25 | filename = "lua_filters/telemetry_stats_bytes.lua"
26 | preserve_data = true
27 | 
28 | [TelemetryStatsErrors]
29 | type = "SandboxFilter"
30 | message_matcher = "Type == 'telemetry.incoming_stats' && Fields[channel] == 'ALL' && Fields[bad_records] > 0"
31 | ticker_interval = 10
32 | script_type = "lua"
33 | filename = "lua_filters/telemetry_stats_errors.lua"
34 | preserve_data = true
35 | 


--------------------------------------------------------------------------------
/monitoring/heka/lua_decoders/telemetry_server.lua:
--------------------------------------------------------------------------------
 1 | -- This Source Code Form is subject to the terms of the Mozilla Public
 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this
 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/.
 4 | 
 5 | --[[
 6 | sample input
 7 | ------------
 8 | {"url":"/submit/sample","duration_ms":0.547324,"code":200,"size":4819,"level":"info","message":"OK","timestamp":"2013-09-10T20:43:17.217Z"}
 9 | 
10 | Injected Heka message
11 | ---------------------
12 | Timestamp: 2013-09-10 20:43:17.216999936 +0000 UTC
13 | Type: telemetry.server
14 | Hostname: trink-x230
15 | Pid: 0
16 | UUID: 2be3ed98-89e8-4bd0-a7c4-9aebe8747a8b
17 | Logger: TelemetryServerInput
18 | Payload:
19 | EnvVersion:
20 | Severity: 6
21 | Fields: [
22 | name:"message" value_string:"OK"
23 | name:"code" value_type:DOUBLE value_double:200
24 | name:"url" value_string:"/submit/sample"
25 | name:"duration" value_type:DOUBLE representation:"ms" value_double:0.547324
26 | name:"size" value_type:DOUBLE representation:"B" value_double:4819 ]
27 | --]]
28 | 
29 | require "cjson"
30 | 
31 | local dt = require "date_time"
32 | local syslog = require "syslog"
33 | 
34 | local metadata = {
35 |     duration = {value=0, representation="ms"},
36 |     size = {value=0, representation="B"},
37 | }
38 | 
39 | local msg = {
40 |     Timestamp = nil,
41 |     Type = "telemetry.server",
42 |     Severity = nil,
43 |     Fields = nil
44 | }
45 | 
46 | function process_message()
47 |     json = cjson.decode(read_message("Payload"))
48 |     if not json then return -1 end
49 | 
50 |     local t = lpeg.match(dt.rfc3339, json.timestamp)
51 |     if not t then return -1 end
52 |     msg.Timestamp = dt.time_to_ns(t)
53 |     json.timestamp = nil
54 | 
55 |     msg.Severity = lpeg.match(syslog.severity, json.level)
56 |     json.level = nil
57 | 
58 |     metadata.duration.value = json.duration_ms
59 |     json.duration = metadata.duration
60 |     json.duration_ms = nil
61 | 
62 |     metadata.size.value = json.size
63 |     json.size = metadata.size
64 | 
65 |     msg.Fields = json
66 |     if not pcall(inject_message, msg) then return -1 end
67 | 
68 |     return 0
69 | end
70 | 


--------------------------------------------------------------------------------
/monitoring/heka/lua_filters/telemetry_channel_metrics.lua:
--------------------------------------------------------------------------------
 1 | -- This Source Code Form is subject to the terms of the Mozilla Public
 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this
 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/.
 4 | 
 5 | require "circular_buffer"
 6 | require "string"
 7 | 
 8 | local rows = read_config("rows") or 1440
 9 | local sec_per_row = read_config("sec_per_row") or 60
10 | local REQUESTS    = 1
11 | local TOTAL_SIZE  = 2
12 | 
13 | channels = {}
14 | 
15 | local function add_channel(channel)
16 |     local c = circular_buffer.new(rows, 2, sec_per_row, true)
17 |     c:set_header(REQUESTS, "Requests")
18 |     c:set_header(TOTAL_SIZE, "Total Size", "KiB")
19 |     channels[channel] = c
20 |     return c
21 | end
22 | 
23 | all = add_channel("ALL")
24 | 
25 | function process_message ()
26 |     local ts = read_message("Timestamp")
27 |     if not all:add(ts, REQUESTS, 1) then return 0 end -- outside the buffer
28 | 
29 |     local rs = read_message("Fields[size]")
30 |     if rs then
31 |         rs = rs / 1024
32 |     else
33 |         rs = 0
34 |     end
35 |     all:add(ts, TOTAL_SIZE, rs)
36 | 
37 |     local url = read_message("Fields[url]")
38 |     local channel = url:match("^/submit/telemetry/[^/]+/[^/]+/[^/]+/[^/]+/([^/]+)")
39 |     if not channel then return 0 end
40 |     if channel ~= "release" and channel ~= "beta" and channel ~= "aurora" and channel ~= "nightly" then
41 |         channel = "other"
42 |     end
43 | 
44 |     local c = channels[channel]
45 |     if not c then
46 |         c = add_channel(channel)
47 |     end
48 |     c:add(ts, REQUESTS, 1)
49 |     c:add(ts, TOTAL_SIZE, rs)
50 | 
51 |     return 0
52 | end
53 | 
54 | function timer_event(ns)
55 |     for k, v in pairs(channels) do
56 |         inject_message(v:format("cbuf"), k)
57 |         inject_message(v:format("cbufd"), k)
58 |     end
59 | end
60 | 


--------------------------------------------------------------------------------
/monitoring/heka/lua_filters/telemetry_server_metrics.lua:
--------------------------------------------------------------------------------
 1 | -- This Source Code Form is subject to the terms of the Mozilla Public
 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this
 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/.
 4 | 
 5 | require "circular_buffer"
 6 | 
 7 | local rows        = 1440
 8 | local sec_per_row = 60
 9 | 
10 | request = circular_buffer.new(rows, 4, sec_per_row, true)
11 | local SUCCESS       = request:set_header(1, "Success"     , "count")
12 | local FAILURE       = request:set_header(2, "Failure"     , "count")
13 | local REQUEST_SIZE  = request:set_header(3, "Request Size", "B")
14 | local REQUEST_TIME  = request:set_header(4, "Request Time", "ms")
15 | 
16 | function process_message ()
17 |     local ts = read_message("Timestamp")
18 |     if not request:add(ts, REQUEST_TIME, read_message("Fields[duration]")) then
19 |         return 0 -- outside the buffer
20 |     end
21 | 
22 |     request:add(ts, REQUEST_SIZE, read_message("Fields[size]"))
23 | 
24 |     if 200 == read_message("Fields[code]") then
25 |         request:add(ts, SUCCESS, 1)
26 |     else
27 |         request:add(ts, FAILURE, 1)
28 |     end
29 | 
30 |     return 0
31 | end
32 | 
33 | function timer_event(ns)
34 |     local title = "Request Statistics"
35 |     inject_message(request:format("cbuf"), title)
36 |     inject_message(request:format("cbufd"), title)
37 | end
38 | 


--------------------------------------------------------------------------------
/monitoring/heka/lua_filters/telemetry_stats_bytes.lua:
--------------------------------------------------------------------------------
 1 | -- This Source Code Form is subject to the terms of the Mozilla Public
 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this
 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/.
 4 | 
 5 | require "circular_buffer"
 6 | require "string"
 7 | 
 8 | local title = "Throughput"
 9 | local rows = read_config("rows") or 1440
10 | local sec_per_row = read_config("sec_per_row") or 60
11 | local BYTES_READ            = 1
12 | local BYTES_WRITTEN         = 2
13 | local BYTES_UNCOMPRESSED    = 3
14 | 
15 | bytes = circular_buffer.new(rows, 3, sec_per_row, true)
16 | bytes:set_header(BYTES_READ         , "Bytes Read"          , "B")
17 | bytes:set_header(BYTES_WRITTEN      , "Bytes Written"       , "B")
18 | bytes:set_header(BYTES_UNCOMPRESSED , "Bytes Uncompressed"  , "B")
19 | 
20 | function process_message ()
21 |     local ts = read_message("Timestamp")
22 |     if not bytes:add(ts, BYTES_READ, read_message("Fields[bytes_read]")) then
23 |         return 0 -- outside the buffer
24 |     end
25 | 
26 |     bytes:add(ts, BYTES_WRITTEN, read_message("Fields[bytes_written]"))
27 |     bytes:add(ts, BYTES_UNCOMPRESSED, read_message("Fields[bytes_uncompressed]"))
28 | 
29 |     return 0
30 | end
31 | 
32 | function timer_event(ns)
33 |     inject_message(bytes:format("cbuf"), title)
34 |     inject_message(bytes:format("cbufd"), title)
35 | end
36 | 


--------------------------------------------------------------------------------
/monitoring/heka/lua_filters/telemetry_stats_errors.lua:
--------------------------------------------------------------------------------
 1 | -- This Source Code Form is subject to the terms of the Mozilla Public
 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this
 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/.
 4 | 
 5 | require "circular_buffer"
 6 | require "string"
 7 | 
 8 | local rows = read_config("rows") or 1440
 9 | local sec_per_row = read_config("sec_per_row") or 60
10 | local TOTAL_ERRORS = 1
11 | 
12 | errors = {}
13 | 
14 | local function add_error(name)
15 |     local e = circular_buffer.new(rows, 1, sec_per_row, true)
16 |     e:set_header(TOTAL_ERRORS, "Total Errors")
17 |     errors[name] = e
18 |     return e
19 | end
20 | 
21 | local f = {type = 0, name = "", value = 0, representation = "", count = 0, key = ""}
22 | 
23 | function process_message ()
24 |     local ts = read_message("Timestamp")
25 |     while true do
26 |         f.type, f.name, f.value, f.representation, f.count = read_next_field()
27 |         if not f.type then break end
28 | 
29 |         local name = f.name:match("^bad_records\.(%S+)")
30 |         if name then
31 |             local e = errors[name]
32 |             if not e then
33 |                 e = add_error(name)
34 |             end
35 |             if not e:add(ts, TOTAL_ERRORS, f.value) then break end -- outside the buffer
36 |         end
37 |     end
38 | 
39 |     return 0
40 | end
41 | 
42 | function timer_event(ns)
43 |     for k, v in pairs(errors) do
44 |         inject_message(v:format("cbuf"), k)
45 |         inject_message(v:format("cbufd"), k)
46 |     end
47 | end
48 | 


--------------------------------------------------------------------------------
/monitoring/heka/lua_filters/telemetry_stats_records.lua:
--------------------------------------------------------------------------------
 1 | -- This Source Code Form is subject to the terms of the Mozilla Public
 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this
 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/.
 4 | 
 5 | require "circular_buffer"
 6 | require "string"
 7 | 
 8 | local rows = read_config("rows") or 1440
 9 | local sec_per_row = read_config("sec_per_row") or 60
10 | local RECORDS_READ  = 1
11 | local BAD_RECORDS   = 2
12 | 
13 | loggers = {}
14 | 
15 | local function add_channel(logger, channel)
16 |     local c = circular_buffer.new(rows, 2, sec_per_row, true)
17 |     c:set_header(RECORDS_READ   , "Records Read")
18 |     c:set_header(BAD_RECORDS    , "Bad Records")
19 |     logger[channel] = c
20 |     return c
21 | end
22 | 
23 | function process_message ()
24 |     local logger = read_message("Logger")
25 |     local l = loggers[logger]
26 |     if not l then
27 |         l = {}
28 |         loggers[logger] = l
29 |     end
30 | 
31 |     local channel = read_message("Fields[channel]")
32 |     local c = l[channel]
33 |     if not c then
34 |         c = add_channel(l, channel)
35 |     end
36 | 
37 |     local ts = read_message("Timestamp")
38 |     if not c:add(ts, RECORDS_READ, read_message("Fields[records_read]")) then
39 |         return 0 -- outside the buffer
40 |     end
41 | 
42 |     c:add(ts, BAD_RECORDS, read_message("Fields[bad_records]"))
43 | 
44 |     return 0
45 | end
46 | 
47 | function timer_event(ns)
48 |     for k, v in pairs(loggers) do
49 |         for m, n in pairs(v) do
50 |             local title = string.format("%s.%s", k, m)
51 |             inject_message(n:format("cbuf"), title)
52 |             inject_message(n:format("cbufd"), title)
53 |         end
54 |     end
55 | end
56 | 


--------------------------------------------------------------------------------
/monitoring/heka/server.toml:
--------------------------------------------------------------------------------
 1 | [TelemetryServerInput]
 2 | type = "LogstreamerInput"
 3 | log_directory = "/mnt/telemetry/log"
 4 | file_match = 'telemetry-server\.log'
 5 | decoder = "TelemetryServerDecoder"
 6 | 
 7 | [TelemetryServerDecoder]
 8 | type = "SandboxDecoder"
 9 | script_type = "lua"
10 | filename = "lua_decoders/telemetry_server.lua"
11 | 
12 | [TelemetryServerMetrics]
13 | type = "SandboxFilter"
14 | message_matcher = "Type == 'telemetry.server'"
15 | ticker_interval = 60
16 | script_type = "lua"
17 | filename = "lua_filters/telemetry_server_metrics.lua"
18 | preserve_data = true
19 | 
20 | [TelemetryChannelMetrics]
21 | type = "SandboxFilter"
22 | message_matcher = "Type == 'telemetry.server'"
23 | ticker_interval = 60
24 | script_type = "lua"
25 | filename = "lua_filters/telemetry_channel_metrics.lua"
26 | preserve_data = true
27 | 
28 | [TelemetryChannelMetrics60Days]
29 | type = "SandboxFilter"
30 | message_matcher = "Type == 'telemetry.server'"
31 | ticker_interval = 60
32 | script_type = "lua"
33 | filename = "lua_filters/telemetry_channel_metrics.lua"
34 | preserve_data = true
35 | 
36 | [TelemetryChannelMetrics60Days.config]
37 | rows = 1440
38 | sec_per_row = 3600
39 | 


--------------------------------------------------------------------------------
/monitoring/process_incoming/viz/css/metrics-graphics-demo.css:
--------------------------------------------------------------------------------
1 | #long svg .y-axis line,
2 | #fake_users3 svg .y-axis line {
3 |     opacity: 0.1;
4 | }
5 | 
6 | #confidence_band svg .x-axis line {
7 |     opacity: 0.1;
8 | }


--------------------------------------------------------------------------------
/monitoring/sanitize_fxos/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/monitoring/sanitize_fxos/__init__.py


--------------------------------------------------------------------------------
/monitoring/telemetry.mozilla.org/check_last_update.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | # This Source Code Form is subject to the terms of the Mozilla Public
 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 7 | 
 8 | import argparse
 9 | import sys
10 | from boto.s3.connection import S3Connection
11 | from datetime import datetime, timedelta
12 | 
13 | default_date_format = '%a, %d %b %Y %H:%M:%S %Z'
14 | message_template = "s3://{0}/{1} was modified {2} than {3} hours ago: {4}"
15 | 
16 | def is_older(target, max_hrs, date_format=default_date_format, verbose=False):
17 |     target_date = datetime.strptime(target, date_format)
18 |     now_date = datetime.utcnow()
19 |     delta = timedelta(hours=(-max_hrs))
20 |     cutoff_date = now_date + delta
21 |     if target_date < cutoff_date:
22 |         if verbose:
23 |             print target_date.strftime(date_format), "<", cutoff_date.strftime(date_format)
24 |         return True
25 |     if verbose:
26 |         print target_date.strftime(date_format), ">=", cutoff_date.strftime(date_format)
27 |     return False
28 | 
29 | def get_args(argv):
30 |     parser = argparse.ArgumentParser(description="Check the last_modified timestamp of an object in S3")
31 |     parser.add_argument("-k", "--aws-key", help="AWS Key", default=None)
32 |     parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key", default=None)
33 |     parser.add_argument("-b", "--bucket", required=True, help="S3 bucket name")
34 |     parser.add_argument("-p", "--path", required=True, help="S3 object path")
35 |     parser.add_argument("-m", "--max-age", help="Threshold for alerting (in hours, default is 24)", type=int, default=24)
36 |     parser.add_argument("-f", "--date-format", help="Override the default date format", default=default_date_format)
37 |     parser.add_argument("-v", "--verbose", action="store_true", help="Print more detailed output")
38 |     args = parser.parse_args(argv)
39 |     return args
40 | 
41 | def main(argv):
42 |     args = get_args(argv)
43 |     conn = S3Connection(args.aws_key, args.aws_secret_key)
44 |     bucket = conn.get_bucket(args.bucket)
45 |     key = bucket.get_key(args.path)
46 | 
47 |     # File was not modified recently.
48 |     if is_older(key.last_modified, args.max_age, args.date_format, args.verbose):
49 |         print message_template.format(args.bucket, key.name, "more",
50 |                                       args.max_age, key.last_modified)
51 |         return 1
52 | 
53 |     # File was modified recently.
54 |     if args.verbose:
55 |         print message_template.format(args.bucket, key.name, "less",
56 |                                       args.max_age, key.last_modified)
57 |     return 0
58 | 
59 | if __name__ == '__main__':
60 |     sys.exit(main(sys.argv[1:]))
61 | 


--------------------------------------------------------------------------------
/process_incoming/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/process_incoming/__init__.py


--------------------------------------------------------------------------------
/process_incoming/worker/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # This Source Code Form is subject to the terms of the Mozilla Public
2 | # License, v. 2.0. If a copy of the MPL was not distributed with this
3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 | 
5 | add_executable(convert ConvertConfig.cpp convert.cpp)
6 | target_link_libraries(convert telemetry)
7 | 
8 | add_subdirectory(common)
9 | 


--------------------------------------------------------------------------------
/process_incoming/worker/ConvertConfig.h:
--------------------------------------------------------------------------------
 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 2 | /* vim: set ts=2 et sw=2 tw=80: */
 3 | /* This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 6 | 
 7 | #ifndef mozilla_telemetry_Convert_Config_h
 8 | #define mozilla_telemetry_Convert_Config_h
 9 | 
10 | #include <boost/filesystem.hpp>
11 | 
12 | namespace mozilla {
13 | namespace telemetry {
14 | 
15 | struct ConvertConfig
16 | {
17 |   std::string             mHekaServer;
18 |   std::string             mHistogramServer;
19 |   boost::filesystem::path mTelemetrySchema;
20 |   boost::filesystem::path mStoragePath;
21 |   boost::filesystem::path mUploadPath;
22 |   uint64_t                mMaxUncompressed;
23 |   size_t                  mMemoryConstraint;
24 |   int                     mCompressionPreset;
25 | };
26 | 
27 | /**
28 |  * Loads the converter configuration from disk.
29 |  *
30 |  * @param aFile Filename containing the JSON configuration.
31 |  * @param aConfig Structure to populate with the configuration.
32 |  */
33 | void ReadConfig(const char* aFile, ConvertConfig& aConfig);
34 | 
35 | }
36 | }
37 | 
38 | #endif // mozilla_telemetry_Convert_Config_h
39 | 


--------------------------------------------------------------------------------
/process_incoming/worker/common/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # This Source Code Form is subject to the terms of the Mozilla Public
 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 4 | 
 5 | set(TELEMETRY_SRC
 6 | TelemetryConstants.cpp
 7 | HistogramSpecification.cpp
 8 | HistogramCache.cpp
 9 | HistogramConverter.cpp
10 | TelemetryRecord.cpp
11 | TelemetrySchema.cpp
12 | RecordWriter.cpp
13 | CompressedFileWriter.cpp
14 | Metric.cpp
15 | message.pb.cc
16 | HekaLogger.cpp)
17 | 
18 | add_library(telemetry STATIC ${TELEMETRY_SRC})
19 | add_dependencies(telemetry rapidjson-0_11)
20 | target_link_libraries(telemetry
21 | ${Boost_LIBRARIES}
22 | ${PROTOBUF_LIBRARIES}
23 | ${ZLIB_LIBRARIES}
24 | ${OPENSSL_LIBRARIES}
25 | ${CMAKE_THREAD_LIBS_INIT}
26 | ${LZMA_LIBRARIES}
27 | -lrt)
28 | 
29 | INCLUDE(CPack)
30 | 
31 | configure_file(TelemetryConstants.in.cpp ${CMAKE_CURRENT_BINARY_DIR}/TelemetryConstants.cpp)
32 | 
33 | add_subdirectory(test)
34 | 


--------------------------------------------------------------------------------
/process_incoming/worker/common/Common.h:
--------------------------------------------------------------------------------
 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 2 | /* vim: set ts=2 et sw=2 tw=80: */
 3 | /* This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 6 | 
 7 | #ifndef mozilla_common_h
 8 | #define mozilla_common_h
 9 | 
10 | #include <rapidjson/document.h>
11 | 
12 | typedef rapidjson::GenericDocument<rapidjson::UTF8<>, rapidjson::CrtAllocator> RapidjsonDocument;
13 | typedef rapidjson::GenericValue<rapidjson::UTF8<>, rapidjson::CrtAllocator>  RapidjsonValue;
14 | 
15 | #endif
16 | 


--------------------------------------------------------------------------------
/process_incoming/worker/common/CompressedFileWriter.h:
--------------------------------------------------------------------------------
 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 2 | /* vim: set ts=2 et sw=2 tw=80: */
 3 | /* This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 6 | 
 7 | #ifndef CompressedFileWriter_h
 8 | #define CompressedFileWriter_h
 9 | 
10 | #include <stdio.h>
11 | #include <stdint.h>
12 | #include <string>
13 | 
14 | #include <lzma.h>
15 | 
16 | /** Buffer output buffer size, before writing to file */
17 | #define BUF_SIZE BUFSIZ
18 | 
19 | namespace mozilla {
20 | namespace telemetry {
21 | 
22 | /**
23 |  * Compressed Wrapper class that writes data to an compressed XZ file
24 |  * This is essentially LZMA2, XZ docs says not use LZMA1 unless you know what
25 |  * you're doing.
26 |  */
27 | class CompressedFileWriter
28 | {
29 | public:
30 |   /** Create CompressedFileWriter */
31 |   CompressedFileWriter();
32 | 
33 |   /**
34 |    * Initialize CompressedFileWriter given an LZMA compression level, a number
35 |    * between 0 and 9.
36 |    * See preset option in xz(1) for more details.
37 |    */
38 |   bool Initialize(FILE *aFile, uint32_t aPreset = 0);
39 | 
40 |   /** Write buffer to compressed file */
41 |   bool Write(const char* aBuffer, size_t aSize, size_t *aCompressedSize = nullptr);
42 | 
43 |   /** Finalize compression */
44 |   bool Finalize(size_t *aCompressedSize = nullptr);
45 | 
46 |   ~CompressedFileWriter();
47 | private:
48 |   FILE* mFile;
49 |   lzma_stream mStream;
50 |   char mBuffer[BUF_SIZE];
51 | };
52 | 
53 | } // namespace telemetry
54 | } // namespace mozilla
55 | 
56 | #endif // CompressedFileWriter_h
57 | 


--------------------------------------------------------------------------------
/process_incoming/worker/common/HekaLogger.cpp:
--------------------------------------------------------------------------------
 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 2 | /* vim: set ts=2 et sw=2 tw=80: */
 3 | /* This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 6 | 
 7 | /// @brief Heka logger implementation @file
 8 | 
 9 | #include "HekaLogger.h"
10 | 
11 | #include <boost/asio.hpp>
12 | 
13 | using boost::asio::ip::tcp;
14 | 
15 | namespace mozilla {
16 | namespace telemetry {
17 | 
18 | ////////////////////////////////////////////////////////////////////////////////
19 | HekaLogger::HekaLogger() : mSocket(mIo) { }
20 | 
21 | ////////////////////////////////////////////////////////////////////////////////
22 | bool HekaLogger::Connect(const std::string& aHeka)
23 | {
24 |   if (mSocket.is_open()) {
25 |     mSocket.close();
26 |   }
27 | 
28 |   size_t pos = aHeka.find(':');
29 |   std::string host = aHeka.substr(0, pos);
30 |   std::string port;
31 |   if (pos != std::string::npos) {
32 |     port = aHeka.substr(pos + 1);
33 |   } else {
34 |     port = "5565";
35 |   }
36 |   try {
37 |     boost::asio::ip::tcp::resolver resolver(mIo);
38 |     boost::asio::ip::tcp::resolver::query query(host, port);
39 |     boost::asio::ip::tcp::resolver::iterator end,  i = resolver.resolve(query);
40 |     if (end == boost::asio::connect(mSocket, i)) {
41 |       return false;
42 |     }
43 |   }
44 |   catch (...) {
45 |     return false;
46 |   }
47 |   return true;
48 | }
49 | 
50 | ////////////////////////////////////////////////////////////////////////////////
51 | void HekaLogger::Disconnect()
52 | {
53 |   mSocket.close();
54 | }
55 | 
56 | ////////////////////////////////////////////////////////////////////////////////
57 | bool HekaLogger::Write(boost::asio::streambuf& sb)
58 | {
59 |   if (!mSocket.is_open()) {
60 |    return false;
61 |   }
62 | 
63 |   try {
64 |     write(mSocket, sb);
65 |   }
66 |   catch (...) {
67 |     return false;
68 |   }
69 |   return true;
70 | }
71 | 
72 | }
73 | }
74 | 


--------------------------------------------------------------------------------
/process_incoming/worker/common/HekaLogger.h:
--------------------------------------------------------------------------------
 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 2 | /* vim: set ts=2 et sw=2 tw=80: */
 3 | /* This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 6 | 
 7 | /** @file Writes log message to Heka via TCP */
 8 | 
 9 | #ifndef mozilla_telemetry_Heka_Logger_h
10 | #define mozilla_telemetry_Heka_Logger_h
11 | 
12 | #include <boost/asio.hpp>
13 | #include <string>
14 | 
15 | namespace mozilla {
16 | namespace telemetry {
17 | 
18 | class HekaLogger
19 | {
20 | public:
21 |   HekaLogger();
22 | 
23 |   /**
24 |    * Connects the logger to a Heka instance.
25 |    *
26 |    * @param aHeka Hostname:port
27 |    *
28 |    * @return bool True if a connection could be established.
29 |    */
30 |   bool Connect(const std::string& aHeka);
31 | 
32 |   /**
33 |    * Closes the connect to the Heka server.
34 |    */
35 |   void Disconnect();
36 | 
37 |   /**
38 |    * Writes the data to the Heka server.
39 |    *
40 |    * @param sb Stream buffer containing the data to output.
41 |    *
42 |    * @return bool True if the data was successfully written to Heka.
43 |    */
44 |   bool Write(boost::asio::streambuf& sb);
45 | 
46 |   bool operator()()
47 |   {
48 |     return mSocket.is_open();
49 |   }
50 | 
51 | private:
52 |   boost::asio::io_service mIo;
53 |   boost::asio::ip::tcp::socket mSocket;
54 | };
55 | 
56 | }
57 | }
58 | 
59 | #endif // mozilla_telemetry_Heka_Logger_h
60 | 
61 | 


--------------------------------------------------------------------------------
/process_incoming/worker/common/HistogramConverter.h:
--------------------------------------------------------------------------------
 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 2 | /* vim: set ts=2 et sw=2 tw=80: */
 3 | /* This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 6 | 
 7 | /// Histogram data converter. @file
 8 | 
 9 | #ifndef mozilla_telemetry_Histogram_Converter_h
10 | #define mozilla_telemetry_Histogram_Converter_h
11 | 
12 | #include "HistogramCache.h"
13 | 
14 | #include <rapidjson/document.h>
15 | 
16 | namespace mozilla {
17 | namespace telemetry {
18 | 
19 | bool ConvertHistogramData(HistogramCache& aCache, RapidjsonDocument& aDoc);
20 | 
21 | }
22 | }
23 | 
24 | #endif // mozilla_telemetry_Histogram_Converter_h
25 | 


--------------------------------------------------------------------------------
/process_incoming/worker/common/Logger.h:
--------------------------------------------------------------------------------
 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 2 | /* vim: set ts=2 et sw=2 tw=80: */
 3 | /* This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 6 | 
 7 | #ifndef mozilla_telemetry_logger_h
 8 | #define mozilla_telemetry_logger_h
 9 | 
10 | #include <cstring>
11 | #include <boost/log/trivial.hpp>
12 | 
13 | #define __SHORT_FORM_OF_FILE__ \
14 |   (strrchr(__FILE__,'/') \
15 |    ? strrchr(__FILE__,'/')+1 \
16 |    : __FILE__ \
17 |    )
18 | 
19 | #define LOGGER(level) BOOST_LOG_TRIVIAL(level) << __FUNCTION__ << " @ " << __SHORT_FORM_OF_FILE__ << ":" << __LINE__ << " - "
20 | 
21 | #endif
22 | 


--------------------------------------------------------------------------------
/process_incoming/worker/common/Metric.cpp:
--------------------------------------------------------------------------------
 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 2 | /* vim: set ts=2 et sw=2 tw=80: */
 3 | /* This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 6 | 
 7 | /// @brief Metric implementation @file
 8 | 
 9 | #include "Metric.h"
10 | #include "TelemetryConstants.h"
11 | 
12 | #include <iostream>
13 | 
14 | namespace mozilla {
15 | namespace telemetry {
16 | 
17 | ////////////////////////////////////////////////////////////////////////////////
18 | void
19 | ConstructField(message::Message& aMsg, Metric& aMetric)
20 | {
21 |   auto f = aMsg.add_fields();
22 |   f->set_name(aMetric.mName);
23 |   f->set_representation(aMetric.mRepresentation);
24 |   f->set_value_type(message::Field_ValueType_DOUBLE);
25 |   f->add_value_double(aMetric.mValue);
26 | }
27 | 
28 | ////////////////////////////////////////////////////////////////////////////////
29 | void
30 | WriteMessage(std::ostream& os, message::Message& aMsg)
31 | {
32 |   if (!os) return;
33 | 
34 |   message::Header h;
35 |   h.set_message_length(aMsg.ByteSize());
36 |   os.put(kRecordSeparator);
37 |   os.put(h.ByteSize());
38 |   h.SerializeToOstream(&os);
39 |   os.put(kUnitSeparator);
40 |   aMsg.SerializeToOstream(&os);
41 | }
42 | 
43 | }
44 | }
45 | 
46 | 


--------------------------------------------------------------------------------
/process_incoming/worker/common/Metric.h:
--------------------------------------------------------------------------------
 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 2 | /* vim: set ts=2 et sw=2 tw=80: */
 3 | /* This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 6 | 
 7 | /** @file
 8 | Generic structure for tracking runtime statistics.
 9 | */
10 | 
11 | #ifndef mozilla_telemetry_Metric_h
12 | #define mozilla_telemetry_Metric_h
13 | 
14 | #include "message.pb.h"
15 | 
16 | #include <ostream>
17 | 
18 | namespace mozilla {
19 | namespace telemetry {
20 | 
21 | struct Metric
22 | {
23 |   Metric(std::string aName, std::string aRepresentation = "count") :
24 |     mName(aName),
25 |     mRepresentation(aRepresentation),
26 |     mValue(0) { }
27 | 
28 |   std::string mName;
29 |   std::string mRepresentation;
30 |   double      mValue;
31 | };
32 | 
33 | /**
34 |  * Helper function to ture a Metric struct into a Heka message field.
35 |  *
36 |  * @param aMsg Heka protobuf message to add the field to.
37 |  * @param aMetric Metric to be converted to a field.
38 |  */
39 | void ConstructField(message::Message &aMsg, Metric& aMetric);
40 | 
41 | /**
42 |  * Writes a Heka protobuf message with proper framing for stream output.
43 |  *
44 |  * @param os Output stream receiving the message.
45 |  * @param aMsg Message to be framed, encoded, and written.
46 |  */
47 | void WriteMessage(std::ostream &os, message::Message &aMsg);
48 | 
49 | }
50 | }
51 | 
52 | #endif // mozilla_telemetry_Metric_h
53 | 


--------------------------------------------------------------------------------
/process_incoming/worker/common/TelemetryConstants.h:
--------------------------------------------------------------------------------
 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 2 | /* vim: set ts=2 et sw=2 tw=80: */
 3 | /* This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 6 | 
 7 | /// Mozilla Telemetry global constants @file
 8 | 
 9 | #ifndef mozilla_telemetry_Telemetry_Constants_h
10 | #define mozilla_telemetry_Telemetry_Constants_h
11 | 
12 | #include <string>
13 | 
14 | namespace mozilla {
15 | namespace telemetry {
16 | extern const unsigned kVersionMajor;
17 | extern const unsigned kVersionMinor;
18 | extern const unsigned kVersionPatch;
19 | 
20 | extern const std::string kProgramName;
21 | extern const std::string kProgramDescription;
22 | 
23 | extern const size_t kMaxTelemetryPath;
24 | extern const size_t kMaxTelemetryData;
25 | 
26 | extern const char kRecordSeparator;
27 | extern const char kUnitSeparator;
28 | 
29 | extern const size_t kExtraBucketsSize;
30 | extern const char* kExtraBuckets[];
31 | }
32 | }
33 | 
34 | #endif // mozilla_telemetry_Telemetry_Constants_h
35 | 


--------------------------------------------------------------------------------
/process_incoming/worker/common/TelemetryConstants.in.cpp:
--------------------------------------------------------------------------------
 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 2 | /* vim: set ts=2 et sw=2 tw=80: */
 3 | /* This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 6 | 
 7 | /// @brief Prevent duplication of string constants between compilation units @file
 8 | 
 9 | #include "@CMAKE_CURRENT_SOURCE_DIR@/TelemetryConstants.h"
10 | 
11 | namespace mozilla {
12 | namespace telemetry {
13 | 
14 | const unsigned kVersionMajor = @CPACK_PACKAGE_VERSION_MAJOR@;
15 | const unsigned kVersionMinor = @CPACK_PACKAGE_VERSION_MINOR@;
16 | const unsigned kVersionPatch = @CPACK_PACKAGE_VERSION_PATCH@;
17 | 
18 | const std::string kProgramName("@PROJECT_NAME@");
19 | const std::string kProgramDescription("@CPACK_PACKAGE_DESCRIPTION_SUMMARY@");
20 | 
21 | const size_t kMaxTelemetryPath = 10 * 1024;
22 | const size_t kMaxTelemetryData = 200 * 1024;
23 | 
24 | const char kRecordSeparator = 0x1e;
25 | const char kUnitSeparator = 0x1f;
26 | 
27 | const size_t kExtraBucketsSize = 5;
28 | const char* kExtraBuckets[] = { "sum", "log_sum", "log_sum_squares",
29 |   "sum_squares_lo", "sum_squares_hi", nullptr };
30 | 
31 | }
32 | }
33 | 


--------------------------------------------------------------------------------
/process_incoming/worker/common/TelemetryRecord.h:
--------------------------------------------------------------------------------
 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 2 | /* vim: set ts=2 et sw=2 tw=80: */
 3 | /* This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 6 | 
 7 | /// Telemetry record. @file
 8 | 
 9 | #ifndef mozilla_telemetry_Telemetry_Record_h
10 | #define mozilla_telemetry_Telemetry_Record_h
11 | 
12 | #include "Common.h"
13 | #include "Metric.h"
14 | #include "TelemetryConstants.h"
15 | 
16 | #include <istream>
17 | #include <boost/utility.hpp>
18 | #include <cstdint>
19 | #include <rapidjson/document.h>
20 | 
21 | namespace mozilla {
22 | namespace telemetry {
23 | 
24 | template<class T>
25 | std::istream& read_value(std::istream& aInput, T& val)
26 | {
27 |   aInput.read((char*)&val, sizeof(val));
28 |   return aInput;
29 | }
30 | 
31 | class TelemetryRecord : boost::noncopyable
32 | {
33 | public:
34 |   TelemetryRecord();
35 |   ~TelemetryRecord();
36 | 
37 |   bool Read(std::istream& aInput);
38 | 
39 |   const char* GetPath();
40 |   uint64_t GetTimestamp();
41 |   RapidjsonDocument& GetDocument();
42 | 
43 |   /**
44 |    * Rolls up the internal metric data into the fields element of the provided
45 |    * message. The metrics are reset after each call.
46 |    *
47 |    * @param aMsg The message fields element will be cleared and then populated
48 |    *             with the TelemetryRecord metrics.
49 |    */
50 |   void GetMetrics(message::Message& aMsg);
51 | 
52 | private:
53 |   struct Metrics {
54 |     Metrics() :
55 |       mInvalidPathLength("Invalid Path Length"),
56 |       mInvalidDataLength("Invalid Data Length"),
57 |       mInflateFailures("Inflate Failures"),
58 |       mParseFailures("Parse Failures"),
59 |       mCorruptData("Corrupt Data", "B")  { }
60 | 
61 |     Metric mInvalidPathLength;
62 |     Metric mInvalidDataLength;
63 |     Metric mInflateFailures;
64 |     Metric mParseFailures;
65 |     Metric mCorruptData;
66 |   };
67 | 
68 |   bool FindRecord(std::istream& aInput);
69 |   bool ReadHeader(std::istream& aInput);
70 |   bool ProcessRecord();
71 |   int Inflate();
72 | 
73 |   RapidjsonDocument mDocument;
74 | 
75 |   uint16_t  mPathLength;
76 |   size_t    mPathSize;
77 |   char*     mPath;
78 | 
79 |   uint32_t  mDataLength;
80 |   size_t    mDataSize;
81 |   char*     mData;
82 | 
83 |   uint64_t  mTimestamp;
84 | 
85 |   uint32_t  mInflateLength;
86 |   size_t    mInflateSize;
87 |   char*     mInflate;
88 | 
89 |   Metrics   mMetrics;
90 | 
91 | };
92 | 
93 | }
94 | }
95 | 
96 | #endif // mozilla_telemetry_Telemetry_Record_h
97 | 


--------------------------------------------------------------------------------
/process_incoming/worker/common/message.proto:
--------------------------------------------------------------------------------
 1 | package message;
 2 | 
 3 | message Header {
 4 |   enum HmacHashFunction {
 5 |     MD5  = 0;
 6 |     SHA1 = 1;
 7 |   }
 8 |   required uint32           message_length      = 1; // length in bytes
 9 | 
10 |   optional HmacHashFunction hmac_hash_function  = 3 [default = MD5];
11 |   optional string           hmac_signer         = 4;
12 |   optional uint32           hmac_key_version    = 5;
13 |   optional bytes            hmac                = 6;
14 | }
15 | 
16 | message Field {
17 |   enum ValueType {
18 |     STRING  = 0;
19 |     BYTES   = 1;
20 |     INTEGER = 2;
21 |     DOUBLE  = 3;
22 |     BOOL    = 4;
23 |   }
24 |   required string       name            = 1;
25 |   optional ValueType    value_type      = 2 [default = STRING];
26 |   optional string       representation  = 3;
27 |   repeated string       value_string    = 4;
28 |   repeated bytes        value_bytes     = 5;
29 |   repeated int64        value_integer   = 6 [packed=true];
30 |   repeated double       value_double    = 7 [packed=true];
31 |   repeated bool         value_bool      = 8 [packed=true];
32 | }
33 |   
34 | message Message {
35 |   required bytes    uuid        = 1;
36 |   required int64    timestamp   = 2; // nanoseconds since UNIX epoch
37 |   optional string   type        = 3;
38 |   optional string   logger      = 4;
39 |   optional int32    severity    = 5 [default = 7];
40 |   optional string   payload     = 6;
41 |   optional string   env_version = 7;
42 |   optional int32    pid         = 8;
43 |   optional string   hostname    = 9;
44 |   repeated Field    fields      = 10;
45 | }
46 | 


--------------------------------------------------------------------------------
/process_incoming/worker/common/test/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # This Source Code Form is subject to the terms of the Mozilla Public
 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 4 | 
 5 | add_executable(TestHistogramSpecification TestHistogramSpecification.cpp)
 6 | target_link_libraries(TestHistogramSpecification telemetry ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY})
 7 | ADD_TEST(TestHistogramSpecification TestHistogramSpecification)
 8 | 
 9 | add_executable(TestHistogramCache TestHistogramCache.cpp)
10 | target_link_libraries(TestHistogramCache telemetry ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY})
11 | ADD_TEST(TestHistogramCache TestHistogramCache)
12 | 
13 | add_executable(TestHistogramConverter TestHistogramConverter.cpp)
14 | target_link_libraries(TestHistogramConverter telemetry ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY})
15 | ADD_TEST(TestHistogramConverter TestHistogramConverter)
16 | 
17 | add_executable(TestTelemetryRecord TestTelemetryRecord.cpp)
18 | target_link_libraries(TestTelemetryRecord telemetry ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY})
19 | ADD_TEST(TestTelemetryRecord TestTelemetryRecord)
20 | 
21 | add_executable(TestTelemetrySchema TestTelemetrySchema.cpp)
22 | target_link_libraries(TestTelemetrySchema telemetry ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY})
23 | ADD_TEST(TestTelemetrySchema TestTelemetrySchema)
24 | 
25 | add_executable(TestRecordWriter TestRecordWriter.cpp)
26 | target_link_libraries(TestRecordWriter telemetry ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY})
27 | ADD_TEST(TestRecordWriter TestRecordWriter)
28 | 
29 | configure_file (${CMAKE_CURRENT_SOURCE_DIR}/TestConfig.in.h ${CMAKE_CURRENT_BINARY_DIR}/TestConfig.h)
30 | include_directories(${CMAKE_CURRENT_BINARY_DIR})
31 | 


--------------------------------------------------------------------------------
/process_incoming/worker/common/test/TestConfig.in.h:
--------------------------------------------------------------------------------
 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 2 | /* vim: set ts=2 et sw=2 tw=80: */
 3 | /* This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 6 | 
 7 | #ifndef mozilla_telemetry_Test_Config_h
 8 | #define mozilla_telemetry_Test_Config_h
 9 | #include <string>
10 | 
11 | const std::string kDataPath("${CMAKE_CURRENT_SOURCE_DIR}/data/");
12 | 
13 | #endif // mozilla_telemetry_Test_Config_h
14 | 


--------------------------------------------------------------------------------
/process_incoming/worker/common/test/TestHistogramCache.cpp:
--------------------------------------------------------------------------------
 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 2 | /* vim: set ts=2 et sw=2 tw=80: */
 3 | /* This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 6 | 
 7 | #define BOOST_TEST_MODULE TestHistogramCache
 8 | #include <boost/test/unit_test.hpp>
 9 | #include "TestConfig.h"
10 | #include "../HistogramCache.h"
11 | 
12 | using namespace std;
13 | using namespace mozilla::telemetry;
14 | 
15 | BOOST_AUTO_TEST_CASE(test_valid)
16 | {
17 |   HistogramCache cache("localhost:9898");
18 |   auto h = cache.FindHistogram("https://hg.mozilla.org/releases/mozilla-release/rev/a55c55edf302");
19 |   BOOST_REQUIRE(h);
20 | }
21 | 
22 | BOOST_AUTO_TEST_CASE(test_unknown_revision)
23 | {
24 |   HistogramCache cache("localhost:9898");
25 |   auto h = cache.FindHistogram("https://hg.mozilla.org/releases/mozilla-release/rev/f55c55edf302");
26 |   BOOST_REQUIRE(!h);
27 | }
28 | 
29 | BOOST_AUTO_TEST_CASE(test_invalid_revision)
30 | {
31 |   HistogramCache cache("localhost:9898");
32 |   auto h = cache.FindHistogram("missing");
33 |   BOOST_REQUIRE(!h);
34 | }
35 | 


--------------------------------------------------------------------------------
/process_incoming/worker/common/test/TestHistogramConverter.cpp:
--------------------------------------------------------------------------------
 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 2 | /* vim: set ts=2 et sw=2 tw=80: */
 3 | /* This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 6 | 
 7 | #define BOOST_TEST_MODULE TestHistogramConverter
 8 | #include <boost/test/unit_test.hpp>
 9 | #include "TestConfig.h"
10 | #include "../HistogramConverter.h"
11 | 
12 | #include <rapidjson/document.h>
13 | #include <rapidjson/stringbuffer.h>
14 | #include <rapidjson/writer.h>
15 | 
16 | using namespace std;
17 | using namespace mozilla::telemetry;
18 | 
19 | BOOST_AUTO_TEST_CASE(test_converter)
20 | {
21 |   const char* hist = "{\"ver\":1,\"histograms\":{\"A11Y_IATABLE_USAGE_FLAG\":{\"range\":[1,2],\"bucket_count\":3,\"histogram_type\":3,\"values\":{\"0\":1,\"1\":0},\"sum\":4984161763,\"sum_squares_lo\":1.23415,\"sum_squares_hi\":1.01}},\"info\":{\"revision\":\"https://hg.mozilla.org/releases/mozilla-release/rev/a55c55edf302\"}}";
22 | 
23 |   const char* conv = "{\"ver\":2,\"histograms\":{\"A11Y_IATABLE_USAGE_FLAG\":[1,0,0,4984161763,-1,-1,1.23415,1.01]},\"info\":{\"revision\":\"https://hg.mozilla.org/releases/mozilla-release/rev/a55c55edf302\"}}";
24 | 
25 |   RapidjsonDocument d;
26 |   d.Parse<0>(hist);
27 |   BOOST_REQUIRE(!d.HasParseError());
28 | 
29 |   HistogramCache cache("localhost:9898");
30 |   BOOST_REQUIRE_EQUAL(true, ConvertHistogramData(cache, d));
31 |   rapidjson::StringBuffer sb;
32 |   rapidjson::Writer<rapidjson::StringBuffer> writer(sb);
33 |   d.Accept(writer);
34 |   BOOST_REQUIRE_EQUAL(conv, sb.GetString());
35 | }
36 | 


--------------------------------------------------------------------------------
/process_incoming/worker/common/test/TestRecordWriter.cpp:
--------------------------------------------------------------------------------
 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 2 | /* vim: set ts=2 et sw=2 tw=80: */
 3 | /* This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 6 | 
 7 | #define BOOST_TEST_MODULE TestRecordWriter
 8 | 
 9 | #include "TestConfig.h"
10 | #include "../RecordWriter.h"
11 | 
12 | #include <stdlib.h>
13 | #include <fstream>
14 | 
15 | #include <boost/test/unit_test.hpp>
16 | #include <boost/regex.hpp>
17 | 
18 | using namespace std;
19 | using namespace mozilla::telemetry;
20 | 
21 | namespace fs = boost::filesystem;
22 | 
23 | BOOST_AUTO_TEST_CASE(test_converter)
24 | {
25 |   fs::path workDir = ".work";
26 |   fs::path uploadDir = ".upload";
27 | 
28 |   BOOST_REQUIRE(!fs::exists(workDir));
29 |   BOOST_REQUIRE(!fs::exists(uploadDir));
30 |   fs::create_directory(workDir);
31 |   fs::create_directory(uploadDir);
32 | 
33 |   RecordWriter writer(workDir.string(), uploadDir.string(), 1048576, 1000, 0);
34 |   string payload = "431ab5c3-2712-4ab7-a4b6-e9b61f3a1f30	{\"ver\":2,\"histograms\":{\"A11Y_IATABLE_USAGE_FLAG\":[1,0,0,0,-1,-1,1.23415,1.01]},\"info\":{\"revision\":\"https://hg.mozilla.org/releases/mozilla-release/rev/a55c55edf302\"}}";
35 |   string prefix = "output";
36 |   writer.Write(prefix, payload.c_str(), payload.size() + 1);
37 |   writer.Finalize();
38 |   BOOST_REQUIRE(fs::is_empty(workDir));
39 |   BOOST_REQUIRE(!fs::is_empty(uploadDir));
40 | 
41 |   fs::directory_iterator it(uploadDir);
42 |   fs::path generated = it->path();
43 | 
44 |   string filename = generated.leaf().string();
45 |   boost::regex reg(prefix + "\\.v2\\.log\\.[0-9a-f]{32}\\.xz");
46 |   BOOST_REQUIRE(regex_match(filename.begin(), filename.end(), reg));
47 | 
48 |   string command = "xz -d " + generated.string();
49 |   BOOST_REQUIRE(system(command.c_str()) == 0);
50 | 
51 |   fs::path decompressed = generated.replace_extension();
52 |   ifstream decompressedFile(decompressed.string());
53 |   string line;
54 |   BOOST_REQUIRE(getline(decompressedFile, line, '\0'));
55 |   BOOST_REQUIRE_EQUAL(line, payload);
56 | 
57 |   fs::remove_all(workDir);
58 |   fs::remove_all(uploadDir);
59 | }
60 | 


--------------------------------------------------------------------------------
/process_incoming/worker/common/test/TestTelemetryRecord.cpp:
--------------------------------------------------------------------------------
 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 2 | /* vim: set ts=2 et sw=2 tw=80: */
 3 | /* This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 6 | 
 7 | #define BOOST_TEST_MODULE TestTelemetryRecord
 8 | #include <boost/test/unit_test.hpp>
 9 | #include "TestConfig.h"
10 | #include "../TelemetryRecord.h"
11 | 
12 | #include <string>
13 | #include <fstream>
14 | #include <sstream>
15 | 
16 | #include <rapidjson/writer.h>
17 | 
18 | #include <iostream>
19 | 
20 | using namespace std;
21 | using namespace mozilla::telemetry;
22 | 
23 | static const string rec("\x1e\x04\x00\x07\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00" "abcd{\"a\":8}", 26);
24 | 
25 | BOOST_AUTO_TEST_CASE(test_read)
26 | {
27 |   string data(rec + rec);
28 |   istringstream iss(data);
29 |   TelemetryRecord tr;
30 |   for (int i = 0; i < 2; ++i) {
31 |     BOOST_REQUIRE_EQUAL(true, tr.Read(iss));
32 |     BOOST_REQUIRE_EQUAL(1, tr.GetTimestamp());
33 |     BOOST_REQUIRE_EQUAL("abcd", tr.GetPath());
34 |     BOOST_REQUIRE_EQUAL(8, tr.GetDocument()["a"].GetInt());
35 |   }
36 | }
37 | 
38 | BOOST_AUTO_TEST_CASE(test_exceed_pathlength)
39 | {
40 |   string data(rec + string("\x1e\xff\xff\x07\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00", 15) + rec);
41 |   istringstream iss(data);
42 |   TelemetryRecord tr;
43 |   for (int i = 0; i < 2; ++i) {
44 |     BOOST_REQUIRE_EQUAL(true, tr.Read(iss));
45 |     BOOST_REQUIRE_EQUAL(1, tr.GetTimestamp());
46 |     BOOST_REQUIRE_EQUAL("abcd", tr.GetPath());
47 |     BOOST_REQUIRE_EQUAL(8, tr.GetDocument()["a"].GetInt());
48 |   }
49 | }
50 | 
51 | BOOST_AUTO_TEST_CASE(test_short_pathlength)
52 | {
53 |   string bad_rec("\x1e\x02\x00\x07\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00" "abcd{\"a\":8}", 26);
54 |   string data(bad_rec + rec);
55 |   istringstream iss(data);
56 |   TelemetryRecord tr;
57 | 
58 |   BOOST_REQUIRE_EQUAL(true, tr.Read(iss));
59 |   BOOST_REQUIRE_EQUAL(1, tr.GetTimestamp());
60 |   BOOST_REQUIRE_EQUAL("abcd", tr.GetPath());
61 |   BOOST_REQUIRE_EQUAL(8, tr.GetDocument()["a"].GetInt());
62 | 
63 |   BOOST_REQUIRE_EQUAL(false, tr.Read(iss));
64 | }
65 | 
66 | //BOOST_AUTO_TEST_CASE(test_large_file)
67 | //{
68 | //  ifstream file(kDataPath + "../../../../telemetry.log", ios_base::binary);
69 | //  TelemetryRecord tr;
70 | //  int cnt = 0;
71 | //  while (tr.Read(file)) {
72 | //    ++cnt;
73 | //  }
74 | //  BOOST_REQUIRE_EQUAL(7331, cnt);
75 | //}
76 | 


--------------------------------------------------------------------------------
/process_incoming/worker/common/test/data/invalid.json:
--------------------------------------------------------------------------------
1 | invalid json data
2 | 


--------------------------------------------------------------------------------
/process_incoming/worker/common/test/data/invalid_kind.json:
--------------------------------------------------------------------------------
1 | {"histograms":{"MY_HISTOGRAM": { "kind":"bogus"}}}
2 | 


--------------------------------------------------------------------------------
/process_incoming/worker/common/test/data/invalid_schema.json:
--------------------------------------------------------------------------------
1 | ["a"]
2 | 


--------------------------------------------------------------------------------
/process_incoming/worker/common/test/data/missing_kind.json:
--------------------------------------------------------------------------------
1 | {"histograms":{"MY_HISTOGRAM":{"test":"1"}}}
2 | 


--------------------------------------------------------------------------------
/process_incoming/worker/common/test/data/telemetry1.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/process_incoming/worker/common/test/data/telemetry1.log


--------------------------------------------------------------------------------
/process_incoming/worker/common/test/data/telemetry_schema.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 1,
 3 |   "dimensions": [
 4 |     {
 5 |       "field_name": "submission_date",
 6 |       "allowed_values": "*"
 7 |     },
 8 |     {
 9 |       "field_name": "reason",
10 |       "allowed_values": ["idle-daily","saved-session"]
11 |     },
12 |     {
13 |       "field_name": "appName",
14 |       "allowed_values": ["Firefox","Fennec","Thunderbird"]
15 |     },
16 |     {
17 |       "field_name": "appUpdateChannel",
18 |       "allowed_values": ["release", "beta", "nightly", "aurora"]
19 |     },
20 |     {
21 |       "field_name": "appVersion",
22 |       "allowed_values": "*"
23 |     },
24 |     {
25 |       "field_name": "appBuildID",
26 |       "allowed_values": "*"
27 |     },
28 |     {
29 |       "field_name": "memsize",
30 |       "allowed_values": {"min":448, "max":500}
31 |     }
32 |   ]
33 | }
34 | 
35 | 


--------------------------------------------------------------------------------
/process_incoming/worker/convert.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "heka_server": "localhost:5565",
 3 |     "telemetry_schema": "telemetry_schema.json",
 4 |     "histogram_server": "localhost:9898",
 5 |     "storage_path": "./storage",
 6 |     "upload_path": "./upload",
 7 |     "max_uncompressed": 1048576,
 8 |     "memory_constraint": 1000,
 9 |     "compression_preset": 0
10 | }
11 | 


--------------------------------------------------------------------------------
/provisioning/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/provisioning/__init__.py


--------------------------------------------------------------------------------
/provisioning/ansible/README.md:
--------------------------------------------------------------------------------
 1 | Deploying telemetry-analysis
 2 | ============================
 3 | 
 4 | ## Manual setup tasks: 
 5 | 
 6 | - In the [AWS SES console](https://us-west-2.console.aws.amazon.com/ses/home?region=us-west-2), make sure that the email address "telemetry-alerts@mozilla.com" is verified.
 7 | - Make sure cross IAM S3 permissions are set up if cross-IAM access is required. Edit bucket policies for relevant buckets to look something like this:
 8 | ```json
 9 | {
10 |     "Version": "2008-10-17",
11 |     "Statement": [
12 |         {
13 |             "Sid": "ListAccess",
14 |             "Effect": "Allow",
15 |             "Principal": {
16 |                 "AWS": [
17 |                     "arn:aws:iam::XXXXXXXXXXXX:root"
18 |                 ]
19 |             },
20 |             "Action": "S3:ListBucket",
21 |             "Resource": "arn:aws:s3:::telemetry-published-v2"
22 |         },
23 |         {
24 |             "Sid": "GetAccess",
25 |             "Effect": "Allow",
26 |             "Principal": {
27 |                 "AWS": [
28 |                     "arn:aws:iam::XXXXXXXXXXXX:root"
29 |                 ]
30 |             },
31 |             "Action": "S3:GetObject",
32 |             "Resource": "arn:aws:s3:::telemetry-published-v2/*"
33 |         }
34 |     ]
35 | }
36 | ```
37 | 
38 | ## Automated deployment tasks:
39 | 
40 | - Build an AMI for telemetry workers:
41 | ```bash
42 | ansible-playbook -i hosts -v --extra-vars "@envs/dev.yml" playbooks/build_ami.yml
43 | ```
44 | - Set `worker_ami_id` in [`envs/dev.yml`](envs/dev.yml) to the value output by the above command. This a git-managed file.
45 | - Set the RDS password in `envs/dev_secrets.yml`. See [`envs/dev_secrets.example.yml`](envs/dev_secrets.example.yml) for an example. This is an un-managed file. If the telemetry-analysis resources stack has already been created, the value you should set this to is the password portion of the URL.
46 | - Create the static resources Cloudformation template (only needs to be run once):
47 | ```bash
48 | ansible-playbook -i hosts -v --extra-vars "@envs/dev.yml" --extra-vars "@envs/dev_secrets.yml" playbooks/resources.yml
49 | ```
50 | 
51 | ## To update / deploy the application servers:
52 | 
53 | - Create a new code package to use by updating `sources_version` in [`envs/dev.yml`](envs/dev.yml) and running:
54 | ```bash
55 | ansible-playbook -i hosts -v --extra-vars "@envs/dev.yml" playbooks/make_code_package.yml
56 | ```
57 | - Deploy the CloudFormation template by running:
58 | ```bash
59 | ansible-playbook -i hosts -v --extra-vars "@envs/dev.yml" playbooks/app.yml
60 | ```
61 | - Deploy user-facing DNS with (only needs to be run once):
62 | ```bash
63 | ansible-playbook -i hosts -v --extra-vars "@envs/dev.yml" playbooks/route53.yaml
64 | ```
65 | 


--------------------------------------------------------------------------------
/provisioning/ansible/envs/dev.yml:
--------------------------------------------------------------------------------
 1 | stack_name: telemetry-analysis
 2 | region: us-west-2
 3 | env: dev
 4 | key_name: "20151209-cloudservices-aws-ssh-dev"
 5 | ssl_cert_arn: arn:aws:iam::927034868273:server-certificate/exp20180829_star_telemetry_mozilla_org
 6 | 
 7 | # code version
 8 | sources_version: 31
 9 | 
10 | dns_name: "telemetry-analysis.dev.mozaws.net"
11 | dns_zone_name: "dev.mozaws.net."
12 | public_analysis_dns_name: "telemetry-analysis-output.dev.mozaws.net"
13 | 
14 | instance_type: t2.medium
15 | server_ami_id: ami-a40bea97
16 | 
17 | # this value can be updated using the build_ami playbook
18 | worker_ami_id: ami-db8067bb
19 | 
20 | spark_instance_profile: telemetry-spark-cloudformation-TelemetrySparkInstanceProfile-1SATUBVEXG7E3
21 | spark_emr_bucket: telemetry-spark-emr-2
22 | 


--------------------------------------------------------------------------------
/provisioning/ansible/envs/dev_secrets.example.yml:
--------------------------------------------------------------------------------
1 | # This value should be populated from the output of
2 | # the telemetry-analysis-resources if it already exists.
3 | database_password: 'but does it achieve the scale of the web?'
4 | server_secret: 'yes'
5 | 


--------------------------------------------------------------------------------
/provisioning/ansible/hosts:
--------------------------------------------------------------------------------
1 | localhost
2 | 


--------------------------------------------------------------------------------
/provisioning/ansible/playbooks/app.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | - include: resources.yml
 3 | 
 4 | - hosts: localhost
 5 |   connection: local
 6 |   tasks:
 7 | 
 8 |     - name: get top level git dir
 9 |       command: 'git rev-parse --show-toplevel'
10 |       register: top_level_dir
11 | 
12 |     - name: make resources template
13 |       command: make analysis-service-stack.json
14 |       args:
15 |         chdir: "{{top_level_dir.stdout}}/http/analysis-service"
16 | 
17 |     - name: resources
18 |       cloudformation: stack_name="{{stack_name}}-app" region={{region}} state=present
19 |       args:
20 |         template: "{{top_level_dir.stdout}}/http/analysis-service/analysis-service-stack.json"
21 |         tags:
22 |           App: "telemetry"
23 |           Env: "{{env}}"
24 |           Stack: "{{stack_name}}"
25 |         template_parameters:
26 |           ServerInstanceType: "{{instance_type}}"
27 |           AnalysisSourcesBucket: "{{resources_cfn.stack_outputs.AnalysisSourcesBucket}}"
28 |           AnalysisPrivateDataBucket: "{{resources_cfn.stack_outputs.AnalysisPrivateDataBucket}}"
29 |           AnalysisDBSecurityGroup: "{{resources_cfn.stack_outputs.AnalysisDBSecurityGroup}}"
30 |           AnalysisDatabaseURL: "{{resources_cfn.stack_outputs.AnalysisDatabaseURL}}"
31 |           AnalysisPublicDataBucket: "{{resources_cfn.stack_outputs.AnalysisPublicDataBucket}}"
32 |           AnalysisTemporaryBucket: "{{resources_cfn.stack_outputs.AnalysisTemporaryBucket}}"
33 |           AnalysisLoadBalancer: "{{resources_cfn.stack_outputs.AnalysisLoadBalancer}}"
34 |           ServerSecret: "{{server_secret}}"
35 |           KeyName: "{{key_name}}"
36 |           WorkerAMI: "{{worker_ami_id}}"
37 |           ServerAMI: "{{server_ami_id}}"
38 |           SourcesVersion: "{{sources_version}}"
39 |           AnalysisPublicWorkerProfile: "{{resources_cfn.stack_outputs.AnalysisPublicWorkerProfile}}"
40 |           AnalysisPrivateWorkerProfile: "{{resources_cfn.stack_outputs.AnalysisPrivateWorkerProfile}}"
41 |           SparkEMRBucket: "{{spark_emr_bucket}}"
42 |           SparkInstanceProfile: "{{spark_instance_profile}}"
43 |           # ignore roles
44 |           # AnalysisPublicWorkerRole: "{{resources_cfn.stack_outputs.AnalysisPublicWorkerRole}}"
45 |           # AnalysisPrivateWorkerRole: "{{resources_cfn.stack_outputs.AnalysisPrivateWorkerRole}}"
46 | 


--------------------------------------------------------------------------------
/provisioning/ansible/playbooks/build_ami.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | - hosts: localhost
 3 |   connection: local
 4 |   gather_facts: false
 5 | 
 6 |   tasks:
 7 |   - name: get top level git dir
 8 |     command: 'git rev-parse --show-toplevel'
 9 |     register: top_level_dir
10 | 
11 |   - name: build telemetry AMI
12 |     command: 'time python -u -m provisioning.aws.create_telemetry_worker_ami provisioning/aws/telemetry_worker.hvm.json'
13 |     args:
14 |       chdir: '{{ top_level_dir.stdout }}'
15 | 


--------------------------------------------------------------------------------
/provisioning/ansible/playbooks/make_code_package.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | - include: resources.yml
 3 | 
 4 | - hosts: localhost
 5 |   connection: local
 6 |   gather_facts: false
 7 | 
 8 |   tasks:
 9 |   - name: get top level git dir
10 |     command: 'git rev-parse --show-toplevel'
11 |     register: top_level_dir
12 | 
13 |   - name: build and upload telemetry code
14 |     command: make put SOURCES_BUCKET={{resources_cfn.stack_outputs.AnalysisSourcesBucket}} VERSION={{ sources_version }}
15 |     args:
16 |       chdir: "{{ top_level_dir.stdout }}/http/analysis-service"
17 | 


--------------------------------------------------------------------------------
/provisioning/ansible/playbooks/resources.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | - hosts: localhost
 3 |   connection: local
 4 |   tasks:
 5 |     - include_vars: ../envs/dev_secrets.yml
 6 | 
 7 |     - name: get top level git dir
 8 |       command: 'git rev-parse --show-toplevel'
 9 |       register: top_level_dir
10 | 
11 |     - name: make resources template
12 |       command: make analysis-resources.json
13 |       args:
14 |         chdir: "{{top_level_dir.stdout}}/http/analysis-service"
15 | 
16 |     - name: create resources CFN
17 |       cloudformation: stack_name="{{stack_name}}-resources" region={{region}} state=present
18 |       args:
19 |         template: "{{top_level_dir.stdout}}/http/analysis-service/analysis-resources.json"
20 |         tags:
21 |           App: "telemetry"
22 |           Env: "{{env}}"
23 |           Stack: "{{stack_name}}"
24 |         template_parameters:
25 |           ELBSSLCertARN: "{{ssl_cert_arn}}"
26 |           AnalysisDatabasePassword : "{{database_password}}"
27 |       register: resources_cfn
28 | 


--------------------------------------------------------------------------------
/provisioning/ansible/playbooks/route53.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | - include: resources.yml
 3 | 
 4 | - hosts: localhost
 5 |   connection: local
 6 |   tasks:
 7 |     - name: promote stack
 8 |       cloudformation: stack_name="{{stack_name}}-route53" region={{region}} state=present
 9 |       args:
10 |         template: ../templates/route53.json
11 |         template_parameters:
12 |           DNSName: "{{dns_name}}"
13 |           DNSZoneName: "{{dns_zone_name}}"
14 |           ELBDNSName: "{{resources_cfn.stack_outputs.AnalysisLoadBalancerDNSName}}"
15 |           ELBZoneNameID: "{{resources_cfn.stack_outputs.AnalysisLoadBalancerZoneID}}"
16 |           Environment: "{{env}}"
17 |           Region: "{{region}}"
18 |           AnalysisPublicCDNDomainName: "{{resources_cfn.stack_outputs.AnalysisPublicCDNDomainName}}"
19 |           AnalysisPublicDomainName: "{{public_analysis_dns_name}}"
20 |         tags:
21 |           App: "telemetry"
22 |           Env: "{{env}}"
23 |           Stack: "{{stack_name}}"
24 |       register: promote
25 | 
26 |     - debug: var=promote
27 | 


--------------------------------------------------------------------------------
/provisioning/ansible/templates/route53.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "AWSTemplateFormatVersion": "2010-09-09",
 3 |   "Description": "Telemetry analysis Route53",
 4 |   "Parameters": {
 5 |     "DNSName": {
 6 |       "Type": "String"
 7 |     },
 8 |     "DNSZoneName": {
 9 |       "Default": "dev.mozaws.net.",
10 |       "Type": "String"
11 |     },
12 |     "ELBZoneNameID": {
13 |       "Description": "From app stack.",
14 |       "Type": "String"
15 |     },
16 |     "ELBDNSName": {
17 |       "Description": "From app stack.",
18 |       "Type": "String"
19 |     },
20 |     "Environment": {
21 |       "Description": "Environment",
22 |       "Type": "String",
23 |       "Default": "dev"
24 |     },
25 |     "Region": {
26 |       "Description": "Environment",
27 |       "Type": "String",
28 |       "Default": "us-west-2"
29 |     },
30 |     "AnalysisPublicCDNDomainName": {
31 |       "Type": "String"
32 |     },
33 |     "AnalysisPublicDomainName": {
34 |       "Type": "String"
35 |     }
36 |   },
37 |   "Resources": {
38 |     "R53AliasRecord": {
39 |       "Type": "AWS::Route53::RecordSet",
40 |       "Properties": {
41 |         "HostedZoneName": {
42 |           "Ref": "DNSZoneName"
43 |         },
44 |         "Name": {
45 |           "Ref": "DNSName"
46 |         },
47 |         "Type": "A",
48 |         "Region": {
49 |           "Ref": "Region"
50 |         },
51 |         "AliasTarget": {
52 |           "EvaluateTargetHealth": true,
53 |           "HostedZoneId": {
54 |             "Ref": "ELBZoneNameID"
55 |           },
56 |           "DNSName": {
57 |             "Ref": "ELBDNSName"
58 |           }
59 |         },
60 |         "SetIdentifier": {
61 |           "Ref": "Region"
62 |         }
63 |       }
64 |     },
65 |     "PublicBucketR53Record": {
66 |       "Type": "AWS::Route53::RecordSet",
67 |       "Properties": {
68 |         "HostedZoneName": {
69 |           "Ref": "DNSZoneName"
70 |         },
71 |         "Name": {
72 |           "Ref": "AnalysisPublicDomainName"
73 |         },
74 |         "Type": "CNAME",
75 |         "ResourceRecords": [
76 |           {
77 |             "Ref": "AnalysisPublicCDNDomainName"
78 |           }
79 |         ],
80 |         "TTL": 60
81 |       }
82 |     }
83 |   },
84 |   "Outputs": {
85 |     "Domain": {
86 |       "Value": {
87 |         "Ref": "PublicBucketR53Record"
88 |       }
89 |     }
90 |   }
91 | }
92 | 


--------------------------------------------------------------------------------
/provisioning/aws/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/provisioning/aws/__init__.py


--------------------------------------------------------------------------------
/provisioning/aws/aws_incoming.example.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "ssl_user": "ubuntu",
 3 |   "ssl_key_name": "mreid",
 4 |   "ssl_key_path": "~/.ssh/aws/mreid.pem",
 5 |   "ssl_retries": 10,
 6 |   "incoming_bucket": "mreid-telemetry-incoming-test",
 7 |   "incoming_batch_size": 4,
 8 |   "publish_bucket": "mreid-telemetry-published-test",
 9 |   "image": "ami-76831f46",
10 |   "skip_conversion": false,
11 |   "loop": true,
12 |   "instance_type": "t1.micro",
13 |   "security_groups": ["telemetry"],
14 |   "region": "us-west-2",
15 |   "placement": "us-west-2c",
16 |   "name": "mreid-telemetry-process-incoming-test",
17 |   "default_tags": {
18 |     "Owner": "mreid",
19 |     "Application": "telemetry-server"
20 |   }
21 | }
22 | 


--------------------------------------------------------------------------------
/provisioning/aws/aws_incoming.prod.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "incoming_bucket": "telemetry-incoming-v2",
 3 |   "incoming_queue": "telemetry-incoming-v1",
 4 |   "incoming_batch_size": 8,
 5 |   "publish_bucket": "telemetry-published-v2",
 6 |   "image": "ami-76831f46",
 7 |   "skip_conversion": false,
 8 |   "loop": true,
 9 |   "instance_type": "c1.xlarge",
10 |   "ssl_key_name": "mreid",
11 |   "ssl_key_path": "~/.ssh/aws/mreid.pem",
12 |   "ssl_retries": 10,
13 |   "ssl_user": "ubuntu",
14 |   "security_groups": ["telemetry"],
15 |   "region": "us-west-2",
16 |   "placement": "us-west-2c",
17 |   "name": "telemetry-process-incoming-v1-1",
18 |   "default_tags": {
19 |     "Owner": "mreid",
20 |     "Application": "telemetry-server"
21 |   },
22 |   "ephemeral_map": {
23 |     "/dev/xvdb": "ephemeral0",
24 |     "/dev/xvdc": "ephemeral1",
25 |     "/dev/xvdd": "ephemeral2",
26 |     "/dev/xvde": "ephemeral3"
27 |   }
28 | }
29 | 


--------------------------------------------------------------------------------
/provisioning/aws/aws_telemetry_server_config.example.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "ssl_user": "ubuntu",
 3 |   "ssl_key_name": "mreid",
 4 |   "ssl_key_path": "~/.ssh/aws/mreid.pem",
 5 |   "ssl_retries": 10,
 6 |   "base_dir": "/mnt/telemetry",
 7 |   "instance_type": "m1.xlarge",
 8 |   "image": "ami-bf1d8a8f",
 9 |   "security_groups": ["telemetry"],
10 |   "region": "us-west-2",
11 |   "placement": "us-west-2c",
12 |   "shutdown_behavior": "stop",
13 |   "name": "mreid-telemetry-server-test",
14 |   "incoming_bucket": "mreid-telemetry-incoming-test",
15 |   "process_incoming_config": "./aws_incoming.example.json",
16 |   "default_tags": {
17 |     "Owner": "mreid",
18 |     "Application": "telemetry-server"
19 |   },
20 |   "skip_termination": true,
21 |   "ephemeral_map": {
22 |     "/dev/xvdb": "ephemeral0",
23 |     "/dev/xvdc": "ephemeral1",
24 |     "/dev/xvdd": "ephemeral2",
25 |     "/dev/xvde": "ephemeral3"
26 |   }
27 | }
28 | 


--------------------------------------------------------------------------------
/provisioning/aws/aws_telemetry_server_config.prod.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "ssl_user": "ubuntu",
 3 |   "ssl_key_name": "mreid",
 4 |   "ssl_key_path": "~/.ssh/aws/mreid.pem",
 5 |   "ssl_retries": 10,
 6 |   "base_dir": "/mnt/telemetry",
 7 |   "instance_type": "m1.small",
 8 |   "image": "ami-ace67f9c",
 9 |   "security_groups": ["telemetry"],
10 |   "region": "us-west-2",
11 |   "placement": "us-west-2c",
12 |   "shutdown_behavior": "stop",
13 |   "name": "telemetry-server-v1-primary-1",
14 |   "incoming_bucket": "telemetry-incoming-v2",
15 |   "incoming_queue": "telemetry-incoming-v1",
16 |   "process_incoming_config": "provisioning/aws/aws_incoming.prod.json",
17 |   "primary_server": true,
18 |   "default_tags": {
19 |     "Owner": "mreid",
20 |     "Application": "telemetry-server"
21 |   },
22 |   "skip_termination": true
23 | }
24 | 


--------------------------------------------------------------------------------
/provisioning/aws/aws_telemetry_server_config.prod_secondary.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "ssl_user": "ubuntu",
 3 |   "ssl_key_name": "mreid",
 4 |   "ssl_key_path": "~/.ssh/aws/mreid.pem",
 5 |   "ssl_retries": 10,
 6 |   "base_dir": "/mnt/telemetry",
 7 |   "instance_type": "m1.small",
 8 |   "image": "ami-ace67f9c",
 9 |   "security_groups": ["telemetry"],
10 |   "region": "us-west-2",
11 |   "placement": "us-west-2c",
12 |   "shutdown_behavior": "stop",
13 |   "name": "telemetry-server-v1-secondary-1",
14 |   "incoming_bucket": "telemetry-incoming-v2",
15 |   "incoming_queue": "telemetry-incoming-v1",
16 |   "process_incoming_config": "provisioning/aws/aws_incoming.prod.json",
17 |   "primary_server": false,
18 |   "default_tags": {
19 |     "Owner": "mreid",
20 |     "Application": "telemetry-server"
21 |   },
22 |   "skip_termination": true
23 | }
24 | 


--------------------------------------------------------------------------------
/provisioning/aws/create_ami.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | # This Source Code Form is subject to the terms of the Mozilla Public
 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 7 | 
 8 | import time
 9 | from datetime import date
10 | 
11 | class AmiCreator:
12 |     def __init__(self, launcher):
13 |         self.launcher = launcher
14 | 
15 |     def create(self, description=None):
16 |         self.launcher.go()
17 |         conn = self.launcher.get_connection()
18 |         instance = self.launcher.get_instance()
19 |         print "Instance", instance.id, "is now configured. Stopping it."
20 |         stopping_instances = conn.stop_instances(instance_ids=[instance.id])
21 |         instance.update()
22 |         for i in range(120):
23 |             print i, "Instance is", instance.state
24 |             if instance.state == "stopped":
25 |                 break
26 |             time.sleep(1)
27 |             instance.update()
28 | 
29 |         print "Creating an AMI..."
30 |         # Create an AMI (after stopping the instance)
31 |         # Give it a good name %s-yyyymmdd where %s is instance name stolen from
32 |         # launcher which reads it from config or commandline
33 |         base_name = self.launcher.config["name"]
34 |         ami_name = "{0}-{1}".format(base_name, date.today().strftime("%Y%m%d"))
35 |         ami_desc = description
36 |         if ami_desc is None:
37 |             ami_desc = self.launcher.config.get("description")
38 |         if ami_desc is None:
39 |             ami_desc = 'Generic Telemetry Image'
40 |         # This automatically stops the image first (unless you tell it not to)
41 |         ami_id = conn.create_image(instance.id, ami_name, description=ami_desc)
42 |         print "Created a new AMI:"
43 |         print "    ID:", ami_id
44 |         print "  Name:", ami_name
45 |         print "  Desc:", ami_desc
46 |         # Get the image and wait for it to be available:
47 |         ami_image = conn.get_image(ami_id)
48 |         retry_count = 0
49 |         while retry_count < 15 and ami_image.state != "available":
50 |             retry_count += 1
51 |             print "AMI is", ami_image.state, "... waiting 10s for it to become available"
52 |             time.sleep(10)
53 |             ami_image.update()
54 |         print "AMI is", ami_image.state
55 |         if ami_image.state != "available":
56 |             print "The image is not quite available yet, but you're probably bored of waiting, so we'll continue."
57 |         # Now clean up the instance.
58 |         print "Terminating instance", instance.id
59 |         self.launcher.terminate(conn, instance)
60 |         print "Those AMI details again:"
61 |         print "    ID:", ami_id
62 |         print "  Name:", ami_name
63 |         print "  Desc:", ami_desc
64 |         return 0
65 | 


--------------------------------------------------------------------------------
/provisioning/aws/create_telemetry_base_ami.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | # This Source Code Form is subject to the terms of the Mozilla Public
 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 7 | 
 8 | # Example invocation:
 9 | # $ cd /path/to/telemetry-server
10 | # $ python -m provisioning.aws.create_telemetry_base_ami -k "my_aws_key" -s "my_aws_secret" provisioning/aws/telemetry_server_base.pv.json
11 | 
12 | from launch_telemetry_server import TelemetryServerLauncher
13 | from create_ami import AmiCreator
14 | import sys
15 | import traceback
16 | 
17 | def main():
18 |     launcher = TelemetryServerLauncher()
19 |     creator = AmiCreator(launcher)
20 |     try:
21 |         result = creator.create('Pre-loaded image for telemetry nodes. Knows ' \
22 |                                 'how to run all the core services, but does ' \
23 |                                 'not auto-start them on boot.')
24 |         return result
25 |     except Exception, e:
26 |         print "Error:", e
27 |         traceback.print_exc()
28 |         return 1
29 | 
30 | if __name__ == "__main__":
31 |     sys.exit(main())
32 | 


--------------------------------------------------------------------------------
/provisioning/aws/create_telemetry_worker_ami.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | # This Source Code Form is subject to the terms of the Mozilla Public
 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 7 | 
 8 | # Example invocation:
 9 | # $ cd /path/to/telemetry-server
10 | # $ python -m provisioning.aws.create_telemetry_worker_ami -k "my_aws_key" -s "my_aws_secret" provisioning/aws/telemetry_worker.hvm.json
11 | 
12 | from aws_launcher import Launcher
13 | from create_ami import AmiCreator
14 | import sys
15 | import traceback
16 | 
17 | def main():
18 |     launcher = Launcher()
19 |     creator = AmiCreator(launcher)
20 |     try:
21 |         result = creator.create('Pre-loaded image for telemetry workers. Use ' \
22 |                                 'it for scheduled or adhoc jobs.')
23 |         return result
24 |     except Exception, e:
25 |         print "Error:", e
26 |         traceback.print_exc()
27 |         return 1
28 | 
29 | if __name__ == "__main__":
30 |     sys.exit(main())
31 | 


--------------------------------------------------------------------------------
/provisioning/aws/launch_mapreduce_job.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | # This Source Code Form is subject to the terms of the Mozilla Public
 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 7 | 
 8 | import boto.ec2
 9 | import time
10 | import os
11 | import simplejson as json
12 | from fabric.api import *
13 | from fabric.exceptions import NetworkError
14 | import sys
15 | import aws_util
16 | from aws_launcher import Launcher
17 | import traceback
18 | 
19 | class MapReduceLauncher(Launcher):
20 |     def post_install(self, instance):
21 |         base_dir = self.config.get("base_dir", "/mnt/telemetry")
22 |         with cd(base_dir):
23 |             # "data" is a dummy dir just to give it somewhere to look for local data.
24 |             run("mkdir job work data")
25 | 
26 |     def run(self, instance):
27 |         home = "/home/" + self.ssl_user
28 |         mr_cfg = self.config["mapreduce"]
29 |         base_dir = self.config.get("base_dir", "/mnt/telemetry")
30 |         job_dir = base_dir + "/job"
31 |         data_dir = base_dir + "/data"
32 |         work_dir = base_dir + "/work"
33 |         with cd(home + "/telemetry-server"):
34 |             job_script = mr_cfg["job_script"]
35 |             input_filter = mr_cfg["input_filter"]
36 |             put(job_script, job_dir)
37 |             put(input_filter, job_dir)
38 |             job_script_path = os.path.join(job_dir, os.path.basename(job_script))
39 |             input_filter_path = os.path.join(job_dir, os.path.basename(input_filter))
40 |             output_path = os.path.join(job_dir, "output.txt")
41 |             job_args = (job_script_path, input_filter_path, data_dir, work_dir, output_path, self.aws_key, self.aws_secret_key, mr_cfg["data_bucket"])
42 |             run('python -m mapreduce.job %s --input-filter %s --data-dir %s --work-dir %s --output %s --aws-key "%s" --aws-secret-key "%s" --bucket "%s"' % job_args)
43 |             # TODO: consult "output_compression"
44 |             run("xz " + output_path)
45 |             # TODO: upload job/output.txt.xz to S3 output_bucket.output_filename
46 |             result = get(output_path + ".xz", mr_cfg["output_filename"])
47 |             # TODO: check result.succeeded before bailing.
48 | 
49 | def main():
50 |     try:
51 |         launcher = MapReduceLauncher()
52 |         launcher.go()
53 |         return 0
54 |     except Exception, e:
55 |         print "Error:", e
56 |         traceback.print_exc()
57 |         return 1
58 | 
59 | if __name__ == "__main__":
60 |     sys.exit(main())
61 | 


--------------------------------------------------------------------------------
/provisioning/aws/process_incoming_queue.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | # This Source Code Form is subject to the terms of the Mozilla Public
 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 7 | 
 8 | import time
 9 | from fabric.api import *
10 | import sys
11 | import aws_util
12 | from process_incoming_distributed import ProcessIncomingLauncher
13 | import traceback
14 | 
15 | 
16 | class ProcessIncomingQueueLauncher(ProcessIncomingLauncher):
17 |     def run(self, instance):
18 |         home = "/home/" + self.ssl_user
19 | 
20 |         # Update from github
21 |         with cd(home + "/telemetry-server"):
22 |             run("git pull")
23 | 
24 |         q_conn = aws_util.connect_sqs(self.config["region"], self.aws_key, self.aws_secret_key)
25 |         incoming_queue = q_conn.get_queue(self.config["incoming_queue"])
26 | 
27 |         if self.config.get("loop", False):
28 |             while True:
29 |                 if incoming_queue.count() == 0:
30 |                     print "No files to process yet. Sleeping for a while..."
31 |                     # TODO: Terminate 'instance' and fire up a new one when we need it?
32 |                     time.sleep(60)
33 |                     continue
34 |                 self.process_incoming_queue(instance)
35 |         else:
36 |             self.process_incoming_queue(instance)
37 | 
38 |     def process_incoming_queue(self, instance):
39 |         home = "/home/" + self.ssl_user
40 |         with cd(home + "/telemetry-server"):
41 |             skip_conversion = ""
42 |             if self.config.get("skip_conversion", False):
43 |                 skip_conversion = "--skip-conversion"
44 |             print "Processing incoming queue:", self.config["incoming_queue"]
45 |             run('python -m process_incoming.process_incoming_mp --bad-data-log /mnt/telemetry/bad_records.txt -k "%s" -s "%s" -r "%s" -w /mnt/telemetry/work -o /mnt/telemetry/processed -t ./telemetry/telemetry_schema.json -q "%s" %s %s %s' % (self.aws_key, self.aws_secret_key, self.config["region"], self.config["incoming_queue"], skip_conversion, self.config["incoming_bucket"], self.config["publish_bucket"]))
46 | 
47 | def main():
48 |     try:
49 |         launcher = ProcessIncomingQueueLauncher()
50 |         launcher.go()
51 |         return 0
52 |     except Exception, e:
53 |         print "Error:", e
54 |         traceback.print_exc()
55 |         return 1
56 | 
57 | if __name__ == "__main__":
58 |     sys.exit(main())
59 | 


--------------------------------------------------------------------------------
/provisioning/aws/telemetry_server_base.hvm.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "ssl_user": "ubuntu",
 3 |   "ssl_key_name": "mreid",
 4 |   "ssl_key_path": "~/.ssh/aws/mreid.pem",
 5 |   "ssl_retries": 10,
 6 |   "base_dir": "/mnt/telemetry",
 7 |   "instance_type": "c3.xlarge",
 8 |   "image": "ami-5189a661",
 9 |   "security_groups": ["telemetry"],
10 |   "region": "us-west-2",
11 |   "placement": "us-west-2c",
12 |   "shutdown_behavior": "stop",
13 |   "name": "telemetry-server-base-hvm",
14 |   "default_tags": {
15 |     "Owner": "mreid",
16 |     "Application": "telemetry-server"
17 |   },
18 |   "add_aws_credentials": true,
19 |   "skip_termination": true
20 | }
21 | 


--------------------------------------------------------------------------------
/provisioning/aws/telemetry_server_base.pv.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "ssl_user": "ubuntu",
 3 |   "ssl_key_name": "mreid",
 4 |   "ssl_key_path": "~/.ssh/aws/mreid.pem",
 5 |   "ssl_retries": 10,
 6 |   "base_dir": "/mnt/telemetry",
 7 |   "instance_type": "t1.micro",
 8 |   "image": "ami-6989a659",
 9 |   "security_groups": ["telemetry"],
10 |   "region": "us-west-2",
11 |   "placement": "us-west-2c",
12 |   "shutdown_behavior": "stop",
13 |   "name": "telemetry-server-base-pv",
14 |   "default_tags": {
15 |     "Owner": "mreid",
16 |     "Application": "telemetry-server"
17 |   },
18 |   "add_aws_credentials": true,
19 |   "skip_termination": true
20 | }
21 | 


--------------------------------------------------------------------------------
/provisioning/aws/telemetry_worker.hvm.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "ssl_user": "ubuntu",
 3 |   "ssl_key_name": "mreid",
 4 |   "ssl_key_path": "~/.ssh/aws/mreid.pem",
 5 |   "ssl_retries": 10,
 6 |   "upgrade_os": true,
 7 |   "base_dir": "/mnt/telemetry",
 8 |   "instance_type": "c3.2xlarge",
 9 |   "image": "ami-2cfe1a1f",
10 |   "security_groups": ["telemetry"],
11 |   "region": "us-west-2",
12 |   "placement": "us-west-2c",
13 |   "shutdown_behavior": "stop",
14 |   "name": "telemetry-worker-hvm",
15 |   "default_tags": {
16 |     "Owner": "mreid",
17 |     "Application": "telemetry-server"
18 |   },
19 |   "skip_termination": true
20 | }
21 | 


--------------------------------------------------------------------------------
/provisioning/config/boto.cfg:
--------------------------------------------------------------------------------
1 | [Boto]
2 | debug = 0
3 | num_retries = 20
4 | metadata_service_timeout = 3.0
5 | metadata_service_num_attempts = 20
6 | 


--------------------------------------------------------------------------------
/provisioning/config/telemetry_aws.prod.json:
--------------------------------------------------------------------------------
1 | {
2 |   "incoming_bucket": "telemetry-incoming-v2",
3 |   "incoming_queue": "telemetry-incoming-v1",
4 |   "publish_bucket": "telemetry-published-v2",
5 |   "aws_region": "us-west-2"
6 | }
7 | 


--------------------------------------------------------------------------------
/server/server_config.spot.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "motd": " ==== Spot Telemetry Server. Accepting Submissions since 2013. ====",
 3 |   "max_data_lenth": 204800,
 4 |   "max_path_lenth": 10240,
 5 |   "port": 8080,
 6 |   "log_path": "/mnt/telemetry/data",
 7 |   "max_log_age_ms": 120000,
 8 |   "max_log_size": 50000000,
 9 |   "stats_log_file": "/var/log/telemetry/telemetry-server.log"
10 | }
11 | 


--------------------------------------------------------------------------------
/telemetry/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/telemetry/__init__.py


--------------------------------------------------------------------------------
/telemetry/telemetry_schema.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 1,
 3 |   "dimensions": [
 4 |     {
 5 |       "field_name": "reason",
 6 |       "allowed_values": ["idle-daily","saved-session","android-anr-report","ftu","loop","flash-video","appusage","main"]
 7 |     },
 8 |     {
 9 |       "field_name": "appName",
10 |       "allowed_values": ["Firefox","Fennec","Thunderbird","FirefoxOS","B2G"]
11 |     },
12 |     {
13 |       "field_name": "appUpdateChannel",
14 |       "allowed_values": ["release", "beta", "nightly", "aurora"]
15 |     },
16 |     {
17 |       "field_name": "appVersion",
18 |       "allowed_values": "*"
19 |     },
20 |     {
21 |       "field_name": "appBuildID",
22 |      "allowed_values": "*"
23 |     },
24 |     {
25 |       "field_name": "submission_date",
26 |       "allowed_values": "*"
27 |     }
28 |   ]
29 | }
30 | 


--------------------------------------------------------------------------------
/telemetry/util/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/telemetry/util/__init__.py


--------------------------------------------------------------------------------
/telemetry/util/bench.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | # This Source Code Form is subject to the terms of the Mozilla Public
 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 7 | 
 8 | import time
 9 | from contextlib import contextmanager
10 | @contextmanager
11 | def bench(label):
12 |     start = time.clock()
13 |     try:
14 |         yield
15 |     finally:
16 |         duration = time.clock() - start
17 |         print label, "Elapsed time:", duration, "seconds"
18 | 
19 | import re
20 | from string import maketrans
21 | x = "hello"
22 | trantab = maketrans("\r\n", "  ")
23 | eols = re.compile('[\r\n]')
24 | input2 = "FFFFFFFFFFFFFFFFFFFFFFFF   FFFFF" * 3000
25 | input = "FFFFFFFF\rFFFFFFF\nFF  FFF   \r\r\n\n" * 3000
26 | 
27 | with bench("Translate (with eols)"):
28 |     for i in range(10000):
29 |         if "\r" in input or "\n" in input:
30 |             x = input.translate(trantab)
31 | 
32 | with bench("Replace (with eols)"):
33 |     for i in range(10000):
34 |         if "\r" in input or "\n" in input:
35 |             x = input.replace("\r", " ").replace("\n", " ")
36 | 
37 | with bench("Regex (with eols)"):
38 |     for i in range(10000):
39 |         if "\r" in input or "\n" in input:
40 |             x, count = eols.subn(" ", input)
41 | 
42 | with bench("Translate (no eols)"):
43 |     for i in range(10000):
44 |         if "\r" in input2 or "\n" in input2:
45 |             x = input2.translate(trantab)
46 | 
47 | with bench("Replace (no eols)"):
48 |     for i in range(10000):
49 |         if "\r" in input2 or "\n" in input2:
50 |             x = input2.replace("\r", " ").replace("\n", " ")
51 | 
52 | with bench("Regex (no eols)"):
53 |     for i in range(10000):
54 |         if "\r" in input2 or "\n" in input2:
55 |             x, count = eols.subn(" ", input2)
56 | 


--------------------------------------------------------------------------------
/telemetry/util/bucket_list.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import socket
 3 | import sys
 4 | import traceback
 5 | import boto
 6 | from boto.s3.connection import S3Connection
 7 | from datetime import datetime
 8 | import telemetry.util.timer as timer
 9 | 
10 | def s3obj_to_string(key):
11 |     return u"\t".join((key.name, str(key.size), key.etag[1:-1]))
12 | 
13 | # Update all files on or after submission date.
14 | def list_files(bucket_name, output_file, output_func=s3obj_to_string, prefix=''):
15 |     s3 = S3Connection()
16 |     bucket = s3.get_bucket(bucket_name)
17 |     total_count = 0
18 |     start_time = datetime.now()
19 |     done = False
20 |     last_key = ''
21 |     while not done:
22 |         try:
23 |             for k in bucket.list(prefix=prefix, marker=last_key):
24 |                 last_key = k.name
25 |                 total_count += 1
26 |                 if total_count % 5000 == 0:
27 |                     print "Looked at", total_count, "total records in", timer.delta_sec(start_time), "seconds. Last key was", last_key
28 |                 try:
29 |                     output_file.write(str(output_func(k)) + "\n")
30 |                 except Exception, e:
31 |                     print "Error writing key", k.name, ":", e
32 |                     traceback.print_exc()
33 |             done = True
34 |         except socket.error, e:
35 |             print "Error listing keys:", e
36 |             traceback.print_exc()
37 |             print "Continuing from last seen key:", last_key
38 | 
39 |     output_file.close()
40 |     print "Overall, listed", total_count, "in", timer.delta_sec(start_time), "seconds"
41 | 
42 | def main():
43 |     parser = argparse.ArgumentParser(description="List S3 contents (with retry) to a file")
44 |     parser.add_argument("--output-file", type=argparse.FileType('w'))
45 |     parser.add_argument("--bucket", default="telemetry-published-v2")
46 |     parser.add_argument("--prefix", default="")
47 |     parser.add_argument("--verbose", action="store_true")
48 |     parser.add_argument("--debug", action="store_true")
49 |     args = parser.parse_args()
50 |     
51 |     if args.debug:
52 |         boto.set_stream_logger('boto')
53 | 
54 |     list_files(args.bucket, args.output_file, prefix=args.prefix)
55 | 
56 | if __name__ == "__main__":
57 |     sys.exit(main())
58 | 


--------------------------------------------------------------------------------
/telemetry/util/convert_log_v0_to_v1.py:
--------------------------------------------------------------------------------
 1 | # This Source Code Form is subject to the terms of the Mozilla Public
 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 4 | 
 5 | import sys, struct
 6 | 
 7 | fin = open(sys.argv[1], "rb")
 8 | fout = open(sys.argv[2], "wb")
 9 | 
10 | record_count = 0;
11 | while True:
12 |     header = fin.read(16)
13 |     if header == '':
14 |         break
15 |     record_count += 1
16 | 
17 |     len_path, len_data, timestamp = struct.unpack("<IIQ", header)
18 | 
19 |     # The "<" is to force it to read as Little-endian to match the way it's
20 |     # written. This is the "native" way in linux too, but might as well make
21 |     # sure we read it back the same way.
22 |     # Write 1 + 2 + 4 + 8 bytes
23 |     packed = struct.pack("<BHIQ", 0x1e, len_path, len_data, timestamp)
24 |     fout.write(packed)
25 |     fout.write(fin.read(len_path))
26 |     fout.write(fin.read(len_data))
27 | 
28 |     print "Wrote record", record_count
29 | 


--------------------------------------------------------------------------------
/telemetry/util/heka_message_parser.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | # This Source Code Form is subject to the terms of the Mozilla Public
 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 7 | 
 8 | import simplejson as json
 9 | 
10 | def parse_heka_record(record):
11 |     result = json.loads(record.message.payload)
12 |     result["meta"] = {
13 |         # TODO: uuid, logger, severity, env_version, pid
14 |         "Timestamp": record.message.timestamp,
15 |         "Type":      record.message.type,
16 |         "Hostname":  record.message.hostname,
17 |     }
18 | 
19 |     for field in record.message.fields:
20 |         name = field.name.split('.')
21 |         value = _get_field_value(field)
22 | 
23 | 	if len(name) == 1:  # Treat top-level meta fields as strings
24 | 	    result["meta"][name[0]] = value
25 | 	else:
26 | 	    _add_field(result, name, value)
27 | 
28 |     return result
29 | 
30 | def _get_field_value(field):
31 |     value = field.value_string
32 |     if field.value_type == 1:
33 |         # TODO: Figure out how to support byte fields without blowing up. For
34 |         #       now, work around by skipping byte fields.
35 |         return ""
36 |     elif field.value_type == 2:
37 |         value = field.value_integer
38 |     elif field.value_type == 3:
39 |         value = field.value_double
40 |     elif field.value_type == 4:
41 |         value = field.value_bool
42 |     if len(value):
43 |         return value[0]
44 |     return ""
45 | 
46 | def _add_field(container, keys, value):
47 |     if len(keys) == 1:
48 |         container[keys[0]] = _lazyjson(value)
49 |         return
50 | 
51 |     key = keys.pop(0)
52 |     container[key] = container.get(key, {})
53 |     _add_field(container[key], keys, value)
54 | 
55 | 
56 | def _lazyjson(content):
57 |     if not isinstance(content, basestring):
58 |         raise ValueError("Argument must be a string.")
59 | 
60 |     if content.startswith("{"):
61 |         default = {}
62 |     elif content.startswith("["):
63 |         default = []
64 |     else:
65 |         try:
66 |             return float(content) if '.' in content or 'e' in content.lower() else int(content)
67 |         except:
68 |             return content
69 | 
70 |     class WrapperType(type(default)):
71 |         pass
72 | 
73 |     def wrap(method_name):
74 |         def _wrap(*args, **kwargs):
75 |             if not hasattr(WrapperType, '__cache__'):
76 |                 setattr(WrapperType, '__cache__', json.loads(content))
77 | 
78 |             cached = WrapperType.__cache__
79 |             method = getattr(cached, method_name)
80 |             return method(*args[1:], **kwargs)
81 | 
82 |         return _wrap
83 | 
84 |     wrapper = WrapperType(default)
85 |     for k, v in type(default).__dict__.iteritems():
86 |         if k == "__doc__":
87 |             continue
88 |         else:
89 |             setattr(WrapperType, k, wrap(k))
90 |     return wrapper
91 | 


--------------------------------------------------------------------------------
/telemetry/util/lists.py:
--------------------------------------------------------------------------------
 1 | # This Source Code Form is subject to the terms of the Mozilla Public
 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 4 | 
 5 | def split(big_list, split_size):
 6 |         split_list = []
 7 |         len_big_list = len(big_list)
 8 |         current = 0
 9 |         while current + split_size < len_big_list:
10 |             split_list.append(big_list[current:current+split_size])
11 |             current += split_size
12 |         if current < len_big_list:
13 |             split_list.append(big_list[current:])
14 |         return split_list
15 | 


--------------------------------------------------------------------------------
/telemetry/util/pack_log.py:
--------------------------------------------------------------------------------
 1 | # This Source Code Form is subject to the terms of the Mozilla Public
 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 4 | 
 5 | import sys, struct, gzip, time
 6 | import StringIO as StringIO
 7 | 
 8 | fin = open(sys.argv[1], "rb")
 9 | fout = open(sys.argv[2], "wb")
10 | 
11 | record_count = 0;
12 | while True:
13 |     record_count += 1
14 |     line = fin.readline()
15 |     if line == '':
16 |         break
17 |     else:
18 |         line = line.strip()
19 | 
20 |     [path, data] = line.split("\t", 1)
21 | 
22 |     # The "<" is to force it to read as Little-endian to match the way it's
23 |     # written. This is the "native" way in linux too, but might as well make
24 |     # sure we read it back the same way.
25 |     # Write 1 + 2 + 4 + 8 bytes
26 |     packed = struct.pack("<BHIQ", 0x1e, len(path), len(data), int(round(time.time() * 1000)))
27 |     fout.write(packed)
28 |     fout.write(path)
29 |     fout.write(data)
30 | 
31 |     print "Wrote record", record_count
32 | 


--------------------------------------------------------------------------------
/telemetry/util/timer.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | # This Source Code Form is subject to the terms of the Mozilla Public
 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 7 | 
 8 | from datetime import datetime
 9 | 
10 | def delta_ms(start, end=None):
11 |     if end is None:
12 |         end = datetime.now()
13 |     delta = end - start
14 |     ms = delta.seconds * 1000.0 + float(delta.microseconds) / 1000.0
15 |     # prevent division-by-zero errors by cheating:
16 |     if ms == 0.0:
17 |         return 0.0001
18 |     return ms
19 | 
20 | def delta_sec(start, end=None):
21 |     return delta_ms(start, end) / 1000.0
22 | 


--------------------------------------------------------------------------------
/telemetry/util/unpack_log.py:
--------------------------------------------------------------------------------
 1 | # This Source Code Form is subject to the terms of the Mozilla Public
 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 4 | 
 5 | import sys, struct, gzip
 6 | import StringIO as StringIO
 7 | import simplejson as json
 8 | import files as fu
 9 | 
10 | filename = sys.argv[1]
11 | parse = False
12 | if len(sys.argv) > 2 and sys.argv[2] == 'parse':
13 |     parse = True
14 | 
15 | record_count = 0
16 | bad_records = 0
17 | version = fu.detect_file_version(filename)
18 | print "It appears that this is a", version, "log file."
19 | for r in fu.unpack(filename, verbose=True, file_version=version):
20 |     record_count += 1
21 |     if r.error:
22 |         print "Record", record_count, "was bad:", r.error
23 |         bad_records += 1
24 |         continue
25 | 
26 |     if parse:
27 |         try:
28 |             parsed_json = json.loads(r.data)
29 |         except Exception, e:
30 |             bad_records += 1
31 |             print "Record", record_count, "failed to parse json:", e
32 | 
33 | print "Processed", record_count, "records, with", bad_records, "bad records"
34 | 


--------------------------------------------------------------------------------
/test/test.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/test/test.txt.gz


--------------------------------------------------------------------------------
/test/test.txt.lzma:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/test/test.txt.lzma


--------------------------------------------------------------------------------
/test/test.txt.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/test/test.txt.xz


--------------------------------------------------------------------------------
/test/unicode.v1.packed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/test/unicode.v1.packed


--------------------------------------------------------------------------------