├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── README.md ├── TODO.md ├── __init__.py ├── analysis ├── __init__.py ├── analysis-worker-stack.yaml ├── downloader.py ├── example │ ├── processor │ └── test │ │ ├── input.txt │ │ ├── ss-ff-n-22.lzma │ │ ├── ss-ff-n-28.lzma │ │ └── test-processor.sh ├── helpers.py ├── launcher.py ├── makefile ├── manager.py ├── test-filter.json ├── utils.py └── worker.py ├── bin └── get_histogram_tools.sh ├── cmake ├── FindLZMA.cmake ├── FindProtobuf.cmake ├── doxygen.cmake ├── externals.cmake ├── mozsvc.cmake └── rapidjson-0_11.patch ├── docs ├── BagheeraIntegration.md ├── CompressionBenchmarks.md ├── Deduplication.md ├── MapReduce.md ├── PayloadFormat.md ├── ProcessIncoming.md ├── StorageFormat.md ├── StorageLayout.md ├── data_flow.png ├── data_flow.svg └── telemetry_logo.svg ├── http ├── __init__.py ├── analysis-service │ ├── analysis-resources.yaml │ ├── analysis-service-stack.yaml │ ├── config.py │ ├── crontab.py │ ├── jobs │ │ └── run.sh │ ├── makefile │ ├── requirements │ ├── server.py │ ├── static │ │ ├── cluster.png │ │ ├── schedule.png │ │ ├── style.css │ │ └── worker.png │ ├── templates │ │ ├── base.html │ │ ├── boot-script.sh │ │ ├── cluster │ │ │ ├── cluster.html │ │ │ ├── email.html │ │ │ ├── kill.html │ │ │ ├── monitor.html │ │ │ ├── schedule.html │ │ │ ├── schedule_create.html │ │ │ ├── schedule_delete.html │ │ │ └── schedule_files.html │ │ ├── index.html │ │ ├── instance-launched-email.html │ │ ├── kill.html │ │ ├── macros.html │ │ ├── monitor.html │ │ ├── schedule.html │ │ ├── schedule_create.html │ │ ├── schedule_delete.html │ │ ├── schedule_files.html │ │ └── worker.html │ ├── terminate-expired-instances.py │ └── user.py ├── histogram_server.py ├── relay.js ├── server.js └── server_config.json ├── mapreduce ├── __init__.py ├── addon_perf │ ├── README.md │ ├── addon-perf.json │ ├── addon-scan.json │ ├── addon-versions.py │ ├── addon_perf.py │ ├── combine.py │ ├── filter_template.json │ ├── package.sh │ ├── processAddonPerf.sh │ ├── run-fork.sh │ └── run.sh ├── chromehangs │ ├── chromehangs.py │ ├── combine.py │ ├── combine_week.sh │ ├── extract_common_stacks.py │ ├── filter_template.json │ ├── package.sh │ ├── run.sh │ ├── run_public.sh │ ├── symbolicate.py │ └── test_symbolicate.py ├── examples │ ├── heka │ │ ├── distribution.py │ │ ├── filter.json │ │ └── run.sh │ └── v2 │ │ ├── dims_only.py │ │ ├── distribution.py │ │ ├── filter-nightly-buildid.json │ │ ├── filter_include_all.json │ │ ├── filter_max_buildid.json │ │ ├── filter_min_buildid.json │ │ ├── filter_min_max_buildid.json │ │ ├── filter_saved_session_Fx_prerelease.json │ │ ├── osdistribution.py │ │ ├── simple_counter.py │ │ └── trivial.py ├── experiments │ ├── experiments.py │ ├── filter_template.json │ ├── postprocess.py │ └── run.sh ├── flash │ ├── csv_header.txt │ ├── filter_template.json │ ├── flash_versions.py │ ├── package.sh │ └── run.sh ├── fxosping │ ├── csv_header.txt │ ├── filter_template.json │ ├── fxosping.py │ ├── package.sh │ └── run.sh ├── hekajob.py ├── job.py ├── loop_failure_summary │ ├── failures_by_type.py │ ├── filter_template.json │ ├── header.txt │ ├── run.sh │ └── summarize.py ├── mainthreadio │ ├── csv_header.txt │ ├── filter_template.json │ ├── mainthreadio.py │ ├── package.sh │ ├── run.sh │ └── summary.py └── slowsql │ ├── csv_header.txt │ ├── filter_template.json │ ├── package.sh │ ├── run.sh │ └── slowsql.py ├── mongodb ├── __init__.py ├── examples │ └── osdistribution.js └── importer.py ├── monitoring ├── __init__.py ├── anomaly_detection │ ├── 
detect.py │ └── notify.py ├── expire_flash_video │ ├── __init__.py │ └── expire_flash_video.py ├── heka │ ├── common.toml │ ├── incoming_stats.toml │ ├── lua_decoders │ │ ├── telemetry_incoming_stats.lua │ │ └── telemetry_server.lua │ ├── lua_filters │ │ ├── telemetry_channel_metrics.lua │ │ ├── telemetry_server_metrics.lua │ │ ├── telemetry_stats_bytes.lua │ │ ├── telemetry_stats_errors.lua │ │ └── telemetry_stats_records.lua │ └── server.toml ├── process_incoming │ ├── error_rates.py │ ├── sample_data │ │ ├── TelemetryStatsErrorsAggregator.bad_payload.cbuf │ │ ├── TelemetryStatsErrorsAggregator.conversion_error.cbuf │ │ ├── TelemetryStatsErrorsAggregator.corrupted_data.cbuf │ │ ├── TelemetryStatsErrorsAggregator.empty_data.cbuf │ │ ├── TelemetryStatsErrorsAggregator.invalid_path.cbuf │ │ ├── TelemetryStatsErrorsAggregator.missing_revision.cbuf │ │ ├── TelemetryStatsErrorsAggregator.missing_revision_repo.cbuf │ │ ├── TelemetryStatsErrorsAggregator.uuid_only_path.cbuf │ │ ├── TelemetryStatsErrorsAggregator.write_failed.cbuf │ │ └── TelemetryStatsRecordsAggregator.ReaderALL.cbuf │ └── viz │ │ ├── css │ │ ├── bootstrap.css │ │ ├── metrics-graphics-demo.css │ │ └── metrics-graphics.css │ │ ├── data │ │ └── errors.example.json │ │ ├── index.html │ │ └── js │ │ ├── main.js │ │ └── metrics-graphics.js ├── sanitize_fxos │ ├── __init__.py │ └── sanitize_fxos_pings.py └── telemetry.mozilla.org │ └── check_last_update.py ├── process_incoming ├── __init__.py ├── process_incoming_mp.py ├── process_incoming_serial.py ├── process_incoming_standalone.py └── worker │ ├── CMakeLists.txt │ ├── ConvertConfig.cpp │ ├── ConvertConfig.h │ ├── common │ ├── CMakeLists.txt │ ├── Common.h │ ├── CompressedFileWriter.cpp │ ├── CompressedFileWriter.h │ ├── HekaLogger.cpp │ ├── HekaLogger.h │ ├── HistogramCache.cpp │ ├── HistogramCache.h │ ├── HistogramConverter.cpp │ ├── HistogramConverter.h │ ├── HistogramSpecification.cpp │ ├── HistogramSpecification.h │ ├── Logger.h │ ├── Metric.cpp │ ├── Metric.h │ ├── RecordWriter.cpp │ ├── RecordWriter.h │ ├── TelemetryConstants.h │ ├── TelemetryConstants.in.cpp │ ├── TelemetryRecord.cpp │ ├── TelemetryRecord.h │ ├── TelemetrySchema.cpp │ ├── TelemetrySchema.h │ ├── message.pb.cc │ ├── message.pb.h │ ├── message.proto │ └── test │ │ ├── CMakeLists.txt │ │ ├── TestConfig.in.h │ │ ├── TestHistogramCache.cpp │ │ ├── TestHistogramConverter.cpp │ │ ├── TestHistogramSpecification.cpp │ │ ├── TestRecordWriter.cpp │ │ ├── TestTelemetryRecord.cpp │ │ ├── TestTelemetrySchema.cpp │ │ └── data │ │ ├── 8d3810543edc.json.FIREFOX_AURORA_24_BASE │ │ ├── a55c55edf302.json │ │ ├── a55c55edf302.json.FIREFOX_AURORA_23_BASE │ │ ├── ad0ae007aa9e.json.FIREFOX_AURORA_25_BASE │ │ ├── cache │ │ ├── 8d3810543edc.json │ │ ├── a55c55edf302.json │ │ └── ad0ae007aa9e.json │ │ ├── invalid.json │ │ ├── invalid_kind.json │ │ ├── invalid_schema.json │ │ ├── missing_kind.json │ │ ├── telemetry1.log │ │ └── telemetry_schema.json │ ├── convert.cpp │ └── convert.json ├── provisioning ├── __init__.py ├── ansible │ ├── README.md │ ├── envs │ │ ├── dev.yml │ │ └── dev_secrets.example.yml │ ├── hosts │ ├── playbooks │ │ ├── app.yml │ │ ├── build_ami.yml │ │ ├── make_code_package.yml │ │ ├── resources.yml │ │ └── route53.yml │ └── templates │ │ └── route53.json ├── aws │ ├── __init__.py │ ├── aws_incoming.example.json │ ├── aws_incoming.prod.json │ ├── aws_launcher.py │ ├── aws_telemetry_server_config.example.json │ ├── aws_telemetry_server_config.prod.json │ ├── aws_telemetry_server_config.prod_secondary.json │ ├── 
aws_util.py │ ├── create_ami.py │ ├── create_telemetry_base_ami.py │ ├── create_telemetry_worker_ami.py │ ├── launch_mapreduce_job.py │ ├── launch_telemetry_server.py │ ├── launch_worker.py │ ├── process_incoming_distributed.py │ ├── process_incoming_queue.py │ ├── telemetry_server_base.hvm.json │ ├── telemetry_server_base.pv.json │ └── telemetry_worker.hvm.json ├── cloudformation │ ├── telemetry-regression-alerts.json │ └── telemetry-server-stack.json └── config │ ├── boto.cfg │ └── telemetry_aws.prod.json ├── server └── server_config.spot.json ├── telemetry ├── __init__.py ├── convert.py ├── infoFieldsMap.py ├── persist.py ├── revision_cache.py ├── telemetry_schema.json ├── telemetry_schema.py ├── test_convert.py ├── test_persist.py ├── test_revision_cache.py ├── test_telemetry_schema.py └── util │ ├── __init__.py │ ├── bench.py │ ├── benchmark_server.py │ ├── bucket_list.py │ ├── cf-yaml-helper.py │ ├── compress.py │ ├── convert_local_pings.py │ ├── convert_log_v0_to_v1.py │ ├── export.py │ ├── files.py │ ├── heka_message.py │ ├── heka_message_parser.py │ ├── lists.py │ ├── message_pb2.py │ ├── pack_log.py │ ├── s3.py │ ├── split_raw_log.py │ ├── test_compress.py │ ├── test_downloader.py │ ├── timer.py │ └── unpack_log.py └── test ├── test.txt.gz ├── test.txt.lzma ├── test.txt.xz └── unicode.v1.packed /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.gz 3 | *.bz2 4 | *.xz 5 | *.lzma 6 | *.swp 7 | *.swo 8 | *.out 9 | *.o 10 | *~ 11 | .idea/ 12 | histogram_cache/ 13 | http/analysis-service/analysis-resources.json 14 | http/analysis-service/analysis-service-stack.json 15 | http/analysis-service/telemetry-analysis-service.tar.gz 16 | node_modules/ 17 | htmlcov/ 18 | histogram_tools.py 19 | .DS_Store 20 | CMakeFiles 21 | provisioning/ansible/envs/dev_secrets.yml 22 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | cmake_minimum_required(VERSION 2.8 FATAL_ERROR) 6 | project(telemetry) 7 | set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "telemetry-server") 8 | set(CPACK_PACKAGE_VERSION_MAJOR 0) 9 | set(CPACK_PACKAGE_VERSION_MINOR 1) 10 | set(CPACK_PACKAGE_VERSION_PATCH 0) 11 | 12 | set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") 13 | include(mozsvc) 14 | include(externals) 15 | 16 | find_package(Threads) 17 | find_package(ZLIB REQUIRED) 18 | find_package(LZMA REQUIRED) 19 | find_package(OpenSSL REQUIRED) 20 | find_package(Protobuf 2.3 REQUIRED) 21 | find_package(Boost 1.54.0 REQUIRED log filesystem system thread unit_test_framework regex) 22 | 23 | include_directories(${Boost_INCLUDE_DIRS} ${OPENSSL_INCLUDE_DIR} "${CMAKE_SOURCE_DIR}/process_incoming/worker/common") 24 | 25 | add_subdirectory(process_incoming/worker) 26 |
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | http://www.mozilla.org/MPL/2.0/index.txt 2 |
-------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | TODO 2 | ==== 3 | 4 | - [P2] Add "number of records" to exported filenames 5 | - [P2] Figure out idle-daily de-duplication 6 | - [P2] Supply the correct Histograms.json spec for each record to the Mapper 7 | - [P2] MapReduce: delete downloaded data files after they have been processed. 8 | - [P2] Improve speed of the conversion process 9 | - [P3] Have the "process_incoming" job write bad input records back to S3 10 | - [P3] Stream data from S3 for MapReduce instead of downloading first 11 | - [P3] Add timeout/retry around fetching Histograms.json from hg.mozilla.org 12 | - [P3] Add many tests 13 | - [P3] Add runtime performance metrics 14 | - [P3] Ensure things are in order to accept Addon Histograms, such as 15 | from [pdf.js][5] 16 | - [P4] Change the RevisionCache to fetch the entire history of Histograms.json 17 | and then convert incoming revisions to times to find the right version 18 | 19 | [1]: https://github.com/Cue/scales "Scales" 20 | [2]: http://docs.python.org/2/library/logging.html "Python Logging" 21 | [3]: http://docs.python.org/2/library/profile.html "Python Profilers" 22 | [5]: https://github.com/mozilla/pdf.js/pull/3532/files#L1R29 23 | [7]: http://docs.aws.amazon.com/AmazonS3/latest/dev/object-lifecycle-mgmt.html 24 |
-------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/__init__.py
-------------------------------------------------------------------------------- /analysis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/analysis/__init__.py
-------------------------------------------------------------------------------- /analysis/downloader.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process 2 | from boto.s3.connection import S3Connection 3 | from traceback import print_exc 4 | from utils import mkdirp 5 | import os, sys 6 | 7 | class DownloaderProcess(Process): 8 | """ Worker process that downloads files from a queue to a folder """
9 | def __init__(self, input_queue, output_queue, 10 | work_folder, aws_cred): 11 | super(DownloaderProcess, self).__init__() 12 | self.input_queue = input_queue 13 | self.output_queue = output_queue 14 | self.work_folder = work_folder 15 | mkdirp(self.work_folder) 16 | self.input_bucket = "telemetry-published-v2" 17 | self.aws_cred = aws_cred 18 | self.s3 = S3Connection(**self.aws_cred) 19 | self.bucket = self.s3.get_bucket(self.input_bucket, validate = False) 20 | 21 | def run(self): 22 | while True: 23 | prefix = self.input_queue.get() 24 | self.download(prefix) 25 | 26 | def download(self, prefix): 27 | # Get filename from prefix 28 | filename = os.path.basename(prefix) 29 | # Get target path 30 | target = os.path.join(self.work_folder, filename) 31 | # Download file 32 | retries = 1 33 | success = False 34 | while retries < 3: 35 | try: 36 | k = self.bucket.get_key(prefix) 37 | k.get_contents_to_filename(target) 38 | success = True 39 | break 40 | except: 41 | retries += 1 42 | print >> sys.stderr, "Error on %i'th try:" % retries 43 | print_exc(file = sys.stderr) 44 | 45 | if success: 46 | # Put file to output queue 47 | self.output_queue.put((prefix, target)) 48 | else: 49 | print >> sys.stderr, "Failed to download: %s" % prefix 50 | self.output_queue.put((prefix, None)) 51 |
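As an illustration, the class above might be driven like this (a minimal sketch; the prefix shown and the credential values are placeholders, not real keys):

```python
from multiprocessing import Queue

from downloader import DownloaderProcess

input_queue = Queue()   # S3 key prefixes to fetch
output_queue = Queue()  # (prefix, local_path) results

# Keyword arguments are passed straight through to boto's S3Connection.
aws_cred = {"aws_access_key_id": "...", "aws_secret_access_key": "..."}

downloader = DownloaderProcess(input_queue, output_queue, "work", aws_cred)
downloader.daemon = True  # the run() loop never exits on its own
downloader.start()

input_queue.put("saved_session/Firefox/nightly/22.0a1/example.v2.log.lzma")
prefix, local_path = output_queue.get()  # local_path is None if all retries failed
```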
-------------------------------------------------------------------------------- /analysis/example/processor: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Number of rows counted 4 | rows=0; 5 | 6 | # Read stdin line by line 7 | while read -r line; do 8 | 9 | # Skip empty lines; the last line may be empty 10 | if [ "$line" == "" ]; then 11 | continue; 12 | fi; 13 | 14 | # Split input 15 | prefix=`echo "$line" | cut -f 1`; 16 | path=`echo "$line" | cut -f 2`; 17 | 18 | # Count number of rows 19 | new_rows=`xz -dc $path | wc -l`; 20 | rows=$(($rows + $new_rows)); 21 | 22 | # Delete input file 23 | rm $path; 24 | done; 25 | 26 | # Output aggregated values to files in folder provided as first argument 27 | echo "$rows" > $1/rows_counted.txt; 28 |
-------------------------------------------------------------------------------- /analysis/example/test/input.txt: -------------------------------------------------------------------------------- 1 | saved_session/Firefox/nightly/22.0a1/20130226031002.20131011.v2.log.e28a4032eb744f089a1828ac7399e5d8.lzma input/ss-ff-n-22.lzma 2 | saved_session/Firefox/nightly/28.0a1/20131029030201.20131029.v2.log.0ab8723b6fb3455bb34b04d97482fda2.lzma input/ss-ff-n-28.lzma 3 |
-------------------------------------------------------------------------------- /analysis/example/test/ss-ff-n-22.lzma: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/analysis/example/test/ss-ff-n-22.lzma
-------------------------------------------------------------------------------- /analysis/example/test/ss-ff-n-28.lzma: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/analysis/example/test/ss-ff-n-28.lzma
-------------------------------------------------------------------------------- /analysis/example/test/test-processor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 3 | 4 | # This tests a processor; it takes a processor job_bundle as an input argument, 5 | # that is, a tarball containing a script called `processor` which is to be given 6 | # files as input and output results into a single file 7 | 8 | echo "### Setting up test environment"; 9 | 10 | # Create test-folders 11 | mkdir -p test-folder/input; 12 | mkdir -p test-folder/output; 13 | 14 | # Copy in job_bundle 15 | cp $1 test-folder/job_bundle.tar.gz 16 | 17 | # Copy in test files 18 | cp $DIR/ss-ff-n-22.lzma test-folder/input/ss-ff-n-22.lzma 19 | cp $DIR/ss-ff-n-28.lzma test-folder/input/ss-ff-n-28.lzma 20 | 21 | # Extract job_bundle 22 | cd test-folder; 23 | tar -xzf job_bundle.tar.gz; 24 | 25 | # Run tests 26 | echo "### Running processor"; 27 | cat $DIR/input.txt | ./processor output/; 28 | 29 | echo "### Files produced"; 30 | find output/; 31 | 32 | if [ `ls input/ | wc -l` -ne "0" ]; then 33 | echo "### WARNING"; 34 | echo "Input files were not deleted; please delete them as they are consumed."; 35 | fi; 36 |
-------------------------------------------------------------------------------- /analysis/helpers.py: -------------------------------------------------------------------------------- 1 | try: 2 | import simplejson as json 3 | except ImportError: 4 | import json 5 | from subprocess import Popen, PIPE 6 | from traceback import print_exc 7 | import sys 8 | 9 | def decompress_input(process): 10 | def wrapper(self, prefix, path): 11 | # Find dimensions 12 | dims = prefix.split('/') 13 | dims += dims.pop().split('.')[:2] 14 | 15 | # Open a decompressor 16 | raw_handle = open(path, "rb") 17 | decompressor = Popen( 18 | ['xz', '-d', '-c'], 19 | bufsize = 65536, 20 | stdin = raw_handle, 21 | stdout = PIPE, 22 | stderr = sys.stderr 23 | ) 24 | 25 | # Process each line 26 | line_nb = 0 27 | errors = 0 28 | for line in decompressor.stdout: 29 | line_nb += 1 30 | try: 31 | uid, payload = line.split("\t", 1) 32 | process(self, uid, dims, payload) 33 | except: 34 | print >> sys.stderr, ("Bad input line: %i of %s" % 35 | (line_nb, prefix)) 36 | print_exc(file = sys.stderr) 37 | errors += 1 38 | 39 | # Close decompressor 40 | decompressor.stdout.close() 41 | raw_handle.close() 42 | 43 | # Return number of failed records 44 | return errors 45 | return wrapper 46 | 47 | def parse_input(process): 48 | def wrapper(self, uid, dimensions, payload): 49 | process(self, uid, dimensions, json.loads(payload)) 50 | return decompress_input(wrapper) 51 | 52 | class Processor: 53 | def __init__(self, output_folder): 54 | self.output_folder = output_folder 55 | 56 | def process(self, prefix, path): 57 | # Raise an exception on critical errors 58 | # Print errors to stderr 59 | return 0 # number of errors (rows we had problems parsing) 60 | 61 | @decompress_input 62 | def process(self, uid, dimensions, payload): 63 | pass # Raise exception on error 64 | 65 | @parse_input 66 | def process(self, uid, dimensions, json): 67 | pass # Raise exception on error 68 | 69 | def flush(self): 70 | pass 71 |
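For instance, a concrete processor built on these decorators might look like the following (a hypothetical sketch; it counts records per appName, relying on the dimension order reason/appName/channel/version/buildid/date that `decompress_input` derives from the prefix):

```python
import os
from collections import defaultdict

from helpers import Processor, parse_input

class SessionCounter(Processor):
    def __init__(self, output_folder):
        Processor.__init__(self, output_folder)
        self.counts = defaultdict(int)

    # parse_input (stacked on decompress_input) handles the xz pipe and
    # JSON decoding, so this is called once per successfully parsed record.
    @parse_input
    def process(self, uid, dimensions, payload):
        self.counts[dimensions[1]] += 1  # dimensions[1] is appName

    def flush(self):
        with open(os.path.join(self.output_folder, "counts.txt"), "w") as out:
            for app_name, count in sorted(self.counts.items()):
                out.write("%s\t%d\n" % (app_name, count))
```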
-------------------------------------------------------------------------------- /analysis/makefile: -------------------------------------------------------------------------------- 1 | CFYAML = ../telemetry/util/cf-yaml-helper.py 2 | SOURCES_BUCKET = jonasfj-telemetry-code 3 | VERSION = 1 4 | 5 | analysis-worker-stack.json: analysis-worker-stack.yaml 6 | $(CFYAML) $< > $@ 7 | 8 | put: analysis-worker-stack.json 9 | aws s3 cp analysis-worker-stack.json s3://$(SOURCES_BUCKET)/v$(VERSION)/analysis-worker-stack.json 10 | 11 | clean: 12 | rm -f analysis-worker-stack.json 13 | 14 | .PHONY: put 15 |
-------------------------------------------------------------------------------- /analysis/test-filter.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dimensions": [ 4 | { 5 | "field_name": "reason", 6 | "allowed_values": ["saved_session"] 7 | }, 8 | { 9 | "field_name": "appName", 10 | "allowed_values": "*" 11 | }, 12 | { 13 | "field_name": "appUpdateChannel", 14 | "allowed_values": ["release", "aurora", "nightly", "beta", "nightly-ux"] 15 | }, 16 | { 17 | "field_name": "appVersion", 18 | "allowed_values": "*" 19 | }, 20 | { 21 | "field_name": "appBuildID", 22 | "allowed_values": "*" 23 | }, 24 | { 25 | "field_name": "submission_date", 26 | "allowed_values": ["20131013"] 27 | } 28 | ] 29 | } 30 | 31 |
-------------------------------------------------------------------------------- /analysis/utils.py: -------------------------------------------------------------------------------- 1 | from errno import EEXIST 2 | from multiprocessing import active_children, current_process 3 | import os 4 | 5 | def mkdirp(path): 6 | try: 7 | os.makedirs(path) 8 | except OSError as e: 9 | if e.errno != EEXIST or not os.path.isdir(path): 10 | raise 11 | 12 | 13 |
-------------------------------------------------------------------------------- /bin/get_histogram_tools.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget -c https://hg.mozilla.org/mozilla-central/raw-file/6dc53d54f027/toolkit/components/telemetry/histogram_tools.py -O histogram_tools.py 4 |
-------------------------------------------------------------------------------- /cmake/FindLZMA.cmake: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | # The module defines the following variables 6 | # LZMA_INCLUDE_DIR 7 | # LZMA_LIBRARIES 8 | # LZMA_FOUND 9 | 10 | IF (LZMA_INCLUDE_DIR) 11 | SET(LZMA_FIND_QUIETLY TRUE) 12 | ENDIF (LZMA_INCLUDE_DIR) 13 | 14 | FIND_PATH(LZMA_INCLUDE_DIR lzma.h) 15 | FIND_LIBRARY(LZMA_LIBRARY NAMES lzma ) 16 | 17 | INCLUDE(FindPackageHandleStandardArgs) 18 | FIND_PACKAGE_HANDLE_STANDARD_ARGS(LZMA DEFAULT_MSG LZMA_LIBRARY LZMA_INCLUDE_DIR) 19 | 20 | IF(LZMA_FOUND) 21 | SET( LZMA_LIBRARIES ${LZMA_LIBRARY} ) 22 | ELSE(LZMA_FOUND) 23 | SET( LZMA_LIBRARIES ) 24 | ENDIF(LZMA_FOUND) 25 | 26 | MARK_AS_ADVANCED(LZMA_LIBRARY LZMA_INCLUDE_DIR) 27 |
-------------------------------------------------------------------------------- /cmake/FindProtobuf.cmake: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | # The module defines the following variables: 6 | # PROTOBUF_FOUND - true if the Protobuf was found 7 | # PROTOBUF_EXECUTABLE - path to the executable 8 | # PROTOBUF_VERSION - Protobuf version number 9 | # PROTOBUF_LIBRARIES 10 | # PROTOBUF_INCLUDE_DIR 11 | # Example usage: 12 | # find_package(Protobuf 2.3 REQUIRED) 13 | 14 | 15 | find_program(PROTOBUF_EXECUTABLE protoc PATH_SUFFIXES bin) 16 | if (PROTOBUF_EXECUTABLE) 17 | execute_process(COMMAND ${PROTOBUF_EXECUTABLE} --version OUTPUT_VARIABLE PROTOBUF_VERSION_OUTPUT OUTPUT_STRIP_TRAILING_WHITESPACE) 18 | if(PROTOBUF_VERSION_OUTPUT MATCHES "libprotoc ([0-9]+\\.[0-9]+\\.[0-9]+)") 19 | set(PROTOBUF_VERSION ${CMAKE_MATCH_1}) 20 | endif() 21 | endif() 22 | mark_as_advanced(PROTOBUF_EXECUTABLE) 23 | 24 | find_path(PROTOBUF_INCLUDE_DIR NAMES "google/protobuf/stubs/common.h" ) 25 | find_library(PROTOBUF_LIBRARIES NAMES protobuf) 26 | include(FindPackageHandleStandardArgs) 27 | find_package_handle_standard_args(Protobuf 28 | REQUIRED_VARS PROTOBUF_EXECUTABLE PROTOBUF_VERSION PROTOBUF_LIBRARIES PROTOBUF_INCLUDE_DIR 29 | VERSION_VAR PROTOBUF_VERSION) 30 | -------------------------------------------------------------------------------- /cmake/doxygen.cmake: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | find_package(Doxygen QUIET) 6 | if(DOXYGEN_FOUND) 7 | set(DOXYCONF_IN ${CMAKE_SOURCE_DIR}/doxygen.in.conf) 8 | set(DOXYCONF_OUT ${CMAKE_BINARY_DIR}/doxygen.conf) 9 | if(EXISTS ${DOXYCONF_IN}) 10 | configure_file(${DOXYCONF_IN} ${DOXYCONF_OUT}) 11 | else() 12 | file(WRITE ${DOXYCONF_OUT} " 13 | PROJECT_NAME = \"${PROJECT_NAME}\" 14 | PROJECT_BRIEF = \"${CPACK_PACKAGE_DESCRIPTION_SUMMARY}\" 15 | OUTPUT_DIRECTORY = docs 16 | GENERATE_LATEX = NO 17 | GENERATE_TODOLIST = YES 18 | FULL_PATH_NAMES = YES 19 | STRIP_FROM_PATH = \"${CMAKE_SOURCE_DIR}\" 20 | SOURCE_BROWSER = YES 21 | TAB_SIZE = 4 22 | EXTRACT_ALL = YES 23 | JAVADOC_AUTOBRIEF = YES 24 | RECURSIVE = YES 25 | INPUT = \"${CMAKE_SOURCE_DIR}\" 26 | EXCLUDE_PATTERNS = \"${CMAKE_SOURCE_DIR}/.*\" \"${CMAKE_SOURCE_DIR}/debug*\" \"${CMAKE_SOURCE_DIR}/release*\" 27 | EXAMPLE_PATH = ${EXAMPLE_PATHS} 28 | IMAGE_PATH = ${IMAGE_PATHS} 29 | BUILTIN_STL_SUPPORT = YES 30 | STRIP_CODE_COMMENTS = NO 31 | SHOW_DIRECTORIES = YES 32 | PROJECT_NUMBER = ${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") 33 | endif() 34 | 35 | add_custom_target(docs ${DOXYGEN_EXECUTABLE} ${DOXYCONF_OUT}) 36 | else() 37 | message("Doxygen was not found, the documentation pages will not be generated") 38 | endif() 39 | -------------------------------------------------------------------------------- /cmake/externals.cmake: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | include(ExternalProject) 6 | set_property(DIRECTORY PROPERTY EP_BASE "${CMAKE_BINARY_DIR}/ep_base") 7 | find_program(PATCH_EXECUTABLE patch) 8 | if (NOT PATCH_EXECUTABLE) 9 | message(FATAL_ERROR "patch not found") 10 | endif() 11 | 12 | externalproject_add( 13 | rapidjson-0_11 14 | URL http://rapidjson.googlecode.com/files/rapidjson-0.11.zip 15 | URL_MD5 96a4b1b57ece8bc6a807ceb14ccaaf94 16 | PATCH_COMMAND ${PATCH_EXECUTABLE} -p1 < ${CMAKE_CURRENT_LIST_DIR}/rapidjson-0_11.patch 17 | CONFIGURE_COMMAND "" 18 | BUILD_COMMAND "" 19 | INSTALL_COMMAND "" 20 | ) 21 | 22 | set(RAPIDJSON_INCLUDE_DIRS "${CMAKE_BINARY_DIR}/ep_base/Source/rapidjson-0_11/include") 23 | include_directories(${RAPIDJSON_INCLUDE_DIRS}) 24 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -isystem ${RAPIDJSON_INCLUDE_DIRS}") 25 | 26 | -------------------------------------------------------------------------------- /cmake/mozsvc.cmake: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | if(MSVC) 6 | # Predefined Macros: http://msdn.microsoft.com/en-us/library/b0084kay.aspx 7 | # Compiler options: http://msdn.microsoft.com/en-us/library/fwkeyyhe.aspx 8 | 9 | # set a high warning level and treat them as errors 10 | set(CMAKE_C_FLAGS "/W3 /WX") 11 | 12 | # enable C++ exception handling 13 | set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} /EHsc") 14 | 15 | # debug multi threaded dll runtime, complete debugging info, runtime error checking 16 | set(CMAKE_C_FLAGS_DEBUG "/MDd /Zi /RTC1") 17 | set(CMAKE_CXX_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) 18 | 19 | # multi threaded dll runtime, optimize for speed, auto inlining 20 | set(CMAKE_C_FLAGS_RELEASE "/MD /O2 /Ob2 /DNDEBUG") 21 | set(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) 22 | 23 | set(CPACK_GENERATOR "NSIS") 24 | else() 25 | # Predefined Macros: clang|gcc -dM -E -x c /dev/null 26 | # Compiler options: http://gcc.gnu.org/onlinedocs/gcc/Invoking-GCC.html#Invoking-GCC 27 | set(CMAKE_C_FLAGS "-std=c11 -pedantic -Werror -Wno-error=deprecated -Wall -Wextra -fPIC") 28 | set(CMAKE_CXX_FLAGS "-std=c++11 -pedantic -Werror -Wno-error=deprecated -Wall -Wextra -fPIC -isystem /usr/local/include -isystem /opt/local/include") 29 | set(CMAKE_C_FLAGS_DEBUG "-g") 30 | set(CMAKE_CXX_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) 31 | 32 | set(CMAKE_C_FLAGS_RELEASE "-O2 -DNDEBUG") 33 | set(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) 34 | 35 | set(CMAKE_C_FLAGS_PROFILE "${CMAKE_C_FLAGS_RELEASE} -g -pg") 36 | set(CMAKE_CXX_FLAGS_PROFILE ${CMAKE_C_FLAGS_PROFILE}) 37 | 38 | set(CPACK_GENERATOR "TGZ") 39 | 40 | set(CMAKE_SKIP_BUILD_RPATH FALSE) 41 | set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE) 42 | set(CMAKE_INSTALL_RPATH_USE_LINK_PATH FALSE) 43 | endif() 44 | 45 | set(CPACK_PACKAGE_VENDOR "Mozilla Services") 46 | set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_SOURCE_DIR}/LICENSE") 47 | include(CPack) 48 | include(CTest) 49 | 50 | set(Boost_USE_STATIC_LIBS ON) 51 | set(Boost_USE_MULTITHREADED ON) 52 | set(Boost_USE_STATIC_RUNTIME OFF) 53 | 54 | include(doxygen) 55 | -------------------------------------------------------------------------------- /docs/BagheeraIntegration.md: -------------------------------------------------------------------------------- 1 | Bagheera Integration 2 | ==================== 3 | 4 | Production Telemetry Data is submitted to a [Bagheera][1] server. 
5 | 6 | Bagheera is a REST service that accepts submissions via HTTP. 7 | 8 | As of version 0.11, submissions are first saved to a Kafka queue, which is then 9 | processed by one or more Kafka consumers. 10 | 11 | As a preliminary way to integrate the prototype telemetry server with the 12 | existing pipeline, we use the [KafkaReplayConsumer][2] to replay the 13 | production requests against the prototype server. 14 | 15 | This results in no loss of production data, and an optionally-sampled 16 | stream of data being directed to a second server. 17 | 18 | The simple approach to running such a replay consumer would be to use the 19 | packaged `consumer` script distributed with [Bagheera][1] with a command like: 20 | 21 | ```bash 22 | # Set variables 23 | export BAGHEERA_HOME=/path/to/bagheera 24 | export KAFKA_TOPIC=my_topic 25 | export KAFKA_GID=replay_${KAFKA_TOPIC}_20130624 # ensure the gid is unique to this consumer! 26 | export REPLAY_HOST=www.example.com 27 | export SAMPLE_RATE=0.01 # use '1' to replay all requests, or a float less than one to sample. 28 | 29 | # Run the command 30 | sudo -u bagheera_user $BAGHEERA_HOME/bin/consumer com.mozilla.bagheera.consumer.KafkaReplayConsumer \ 31 | -t $KAFKA_TOPIC \ 32 | -gid $KAFKA_GID \ 33 | -p $BAGHEERA_HOME/conf/kafka.consumer.properties \ 34 | --copy-keys true \ 35 | --dest "http://$REPLAY_HOST/submit/telemetry/%k" \ 36 | --sample $SAMPLE_RATE \ 37 | --delete false 38 | ``` 39 | 40 | In the case where your network security does not allow outbound HTTP requests, 41 | you may need to specify an HTTP proxy. This can easily be done at the JVM 42 | level, so we can invoke the full command manually. We end up with: 43 | 44 | ```bash 45 | export PROXY_HOST=example.proxy.mozilla.com 46 | export PROXY_PORT=9999 47 | sudo -u bagheera_user java \ 48 | -Dhttp.proxyHost=$PROXY_HOST \ 49 | -Dhttp.proxyPort=$PROXY_PORT \ 50 | ...... \ 51 | -cp \ 52 | com.mozilla.bagheera.consumer.KafkaReplayConsumer \ 53 | -t $KAFKA_TOPIC \ 54 | -gid $KAFKA_GID \ 55 | -p $BAGHEERA_HOME/conf/kafka.consumer.properties \ 56 | --copy-keys true \ 57 | --dest "http://$REPLAY_HOST/submit/telemetry/%k" \ 58 | --sample $SAMPLE_RATE \ 59 | --delete false 60 | ``` 61 | 62 | [1]: https://github.com/mozilla-metrics/bagheera "Bagheera" 63 | [2]: https://github.com/mozilla-metrics/bagheera/blob/master/src/main/java/com/mozilla/bagheera/consumer/KafkaReplayConsumer.java "KafkaReplayConsumer" 64 |
-------------------------------------------------------------------------------- /docs/CompressionBenchmarks.md: -------------------------------------------------------------------------------- 1 | Compression Benchmarks 2 | ====================== 3 | 4 | One of the important considerations here is what compression format 5 | to use, and within that format, what level of compression to use. 6 | 7 | After a round of testing that is largely lost to the sands of time, 8 | we settled on the LZMA / XZ format. 9 | 10 | Some statistics on the time vs. space characteristics of various 11 | compression levels can be found at [compression notes][1]. 12 | 13 | To run a real-world test, you can use some code like the following: 14 | 15 | aws s3 cp s3://telemetry-published-bucket/path/to/sample_file.lzma ./ 16 | lzma -d sample_file.lzma 17 | for level in $(seq 0 9); do 18 | echo "compressing with level $level" 19 | time cat sample_file | lzma -${level} > test$level.lzma 20 | ls -l test$level.lzma 21 | done &>> comptest.log 22 | 23 | Using a ~500MB raw input file on a `c3.large` EC2 node, this gives 24 | a result like: 25 | 26 | 27 | Level Time Size Filename 28 | ------- --------- --------- ---------- 29 | level 0 0m26.176s 105830359 test0.lzma 30 | level 1 0m28.231s 89387336 test1.lzma 31 | level 2 0m37.868s 81364589 test2.lzma 32 | level 3 0m52.852s 76801476 test3.lzma 33 | level 4 1m40.807s 73784033 test4.lzma 34 | level 5 2m36.868s 65191241 test5.lzma 35 | level 6 3m39.400s 61367748 test6.lzma 36 | level 7 4m1.284s 60218864 test7.lzma 37 | level 8 4m19.748s 59183316 test8.lzma 38 | level 9 4m47.116s 58338421 test9.lzma 39 | 40 | 41 | Using `xz` instead of `lzma` gives nearly identical numbers, but `xz` is to be 42 | preferred since those files can be concatenated without having to decompress 43 | and compress again. 44 |
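That concatenation property is easy to check with a quick sketch (illustrative only; it drives the `xz` command-line tool through a subprocess):

```python
from subprocess import Popen, PIPE

def xz(args, data):
    # Run xz with the given flags, feeding `data` on stdin.
    p = Popen(["xz"] + args, stdin=PIPE, stdout=PIPE)
    out, _ = p.communicate(data)
    return out

# Two independently compressed .xz streams...
combined = xz(["-c"], "part one\n") + xz(["-c"], "part two\n")
# ...decode in sequence as if they were a single file.
print xz(["-d", "-c"], combined),  # -> "part one\npart two\n"
```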
45 | [1]: https://docs.google.com/spreadsheet/pub?key=0AoRU282jPz57dFBuX0pZX25NNVRlU3lQTDZUVzlEUEE&output=html 46 |
-------------------------------------------------------------------------------- /docs/ProcessIncoming.md: -------------------------------------------------------------------------------- 1 | Architecture for Processing Incoming Data 2 | ========================================= 3 | 4 | Let `N` denote the number of CPU cores available on the processing machine. 5 | 6 | The `Server` Process: 7 | --------------------- 8 | * Create the following folders on the same storage device: 9 | * `downloading/` for files while being downloaded, 10 | * `upload/` for files ready for upload 11 | * `uploading/` for files being uploaded 12 | * `incoming/` for incoming files that have been downloaded 13 | * Start `N / 2` instances of the `Download` process 14 | * Start `N / 2` instances of the `Upload` process 15 | * For `i = 0` to `N` do: 16 | * Create folders `work-i/`, `input-i/`, `log-i/` 17 | * Start `Worker` process (giving it a reference to `i`) 18 | 19 | The `Download` Process: 20 | ----------------------- 21 | * While the number of files in the `incoming` folder is less than `N`: 22 | * Download a new raw telemetry log file to `downloading/` 23 | * Move downloaded file from `downloading/` to `incoming/` 24 | 25 | The `Upload` Process: 26 | --------------------- 27 | * While `upload/` contains files: 28 | * Move a file from `upload/` to `uploading/` 29 | * Upload file to S3 30 | * Delete the file from `uploading/` 31 | 32 | The `Worker` Process i: 33 | ----------------------- 34 | * While `incoming` is non-empty: 35 | * Move file from `incoming/` to `input-i/` 36 | * For each line in file: 37 | * Parse line giving us path and histogram 38 | * If parse error: 39 | * Write to somewhere in `log-i/` 40 | * Skip line 41 | * Convert histogram 42 | * RecordWriter.write(path, histogram.serialize()) 43 | * Delete input file 44 | * If SIGHUP has been seen: 45 | * close all files and compressor contexts in the HashTable 46 | * Compress files and move them to `upload/` 47 | * On SIGHUP: Raise a boolean flag.
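In rough Python terms, the `Worker` loop above amounts to something like this (a simplified sketch; `parse`, `convert`, and `record_writer` stand in for the real parsing, conversion, and RecordWriter components, and the SIGHUP/compression handling is omitted):

```python
import os
import shutil

def parse(line):
    # A stand-in parser: raises ValueError when the line is malformed.
    path, histogram = line.rstrip("\n").split("\t", 1)
    return path, histogram

def run_worker(i, convert, record_writer):
    input_dir, log_dir = "input-%d" % i, "log-%d" % i
    while os.listdir("incoming"):
        name = os.listdir("incoming")[0]
        path = os.path.join(input_dir, name)
        shutil.move(os.path.join("incoming", name), path)  # claim one file
        log = open(os.path.join(log_dir, "errors.log"), "a")
        for line in open(path):
            try:
                out_path, histogram = parse(line)
            except ValueError:
                log.write(line)  # parse error: log the line and skip it
                continue
            record_writer.write(out_path, convert(histogram))
        log.close()
        os.remove(path)  # the input file is fully consumed
```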
48 | 49 | 50 | The idea with WorkerProcess: 51 | ---------------------------- 52 | * We can stop it at any time and upload (by sending it a SIGHUP) 53 | * We can keep it running and feed it data until it produces big files (worst 54 | case, one file per day for a given set of partitions) 55 | * We can tweak the number of compression contexts, reducing intermediate disk I/O 56 | in exchange for increased memory usage 57 | * If we crash, uncompressed files from `work-i/` can be compressed and uploaded 58 | * If we crash, the offending `incoming` file is located in `input-i/`; this can be 59 | uploaded for tests (not for re-processing if we do the previous thing) 60 | * Both conversion and compression happen in WorkerProcess, so we can't fill up 61 | a pipe somewhere and have IPC problems 62 | * A problem with doing conversion and compression in the same process is that if 63 | conversion crashes, partially compressed data is corrupt 64 |
-------------------------------------------------------------------------------- /docs/data_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/docs/data_flow.png
-------------------------------------------------------------------------------- /http/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/http/__init__.py
-------------------------------------------------------------------------------- /http/analysis-service/config.py: -------------------------------------------------------------------------------- 1 | SECRET_KEY = 'Overwrite with a secret on deployment' 2 | 3 | # AWS EC2 configuration 4 | AWS_REGION = 'us-west-2' 5 | INSTANCE_TYPE = 'c3.4xlarge' 6 | WORKER_AMI = 'ami-0057b733' # -> telemetry-worker-hvm-20151019 (Ubuntu 15.04) 7 | WORKER_PRIVATE_PROFILE = 'telemetry-example-profile' 8 | WORKER_PUBLIC_PROFILE = 'telemetry-example-profile' 9 | 10 | # EMR configuration 11 | # Master and slave instance types should be the same, as the telemetry 12 | # setup bootstrap action depends on it to autotune the cluster. 13 | MASTER_INSTANCE_TYPE = INSTANCE_TYPE 14 | SLAVE_INSTANCE_TYPE = INSTANCE_TYPE 15 | EMR_RELEASE = 'emr-4.5.0' 16 | SPARK_INSTANCE_PROFILE = 'telemetry-example-profile' 17 | SPARK_EMR_BUCKET = 'example' 18 | 19 | # Make sure the ephemeral map matches the instance type above.
20 | EPHEMERAL_MAP = { "/dev/xvdb": "ephemeral0", "/dev/xvdc": "ephemeral1" } 21 | SECURITY_GROUPS = [] 22 | INSTANCE_PROFILE = 'telemetry-analysis-profile' 23 | INSTANCE_APP_TAG = 'telemetry-analysis-worker-instance' 24 | EMAIL_SOURCE = 'telemetry-alerts@mozilla.com' 25 | 26 | # Tags for accounting purposes 27 | ACCOUNTING_APP_TAG = 'telemetry-analysis' 28 | ACCOUNTING_TYPE_TAG = 'worker' 29 | 30 | # Buckets for storing S3 data 31 | TEMPORARY_BUCKET = 'bucket-for-ssh-keys' 32 | CODE_BUCKET = 'telemetry-analysis-code-2' 33 | PUBLIC_DATA_BUCKET = 'telemetry-public-analysis-2' 34 | PRIVATE_DATA_BUCKET = 'telemetry-private-analysis-2' 35 | -------------------------------------------------------------------------------- /http/analysis-service/makefile: -------------------------------------------------------------------------------- 1 | CFYAML = ../../telemetry/util/cf-yaml-helper.py 2 | 3 | SOURCES_BUCKET = telemetry-analysis-code-2 4 | VERSION = 21 5 | 6 | FILES = $(shell find * -name "*.py") \ 7 | $(shell find * -name "*.sh") \ 8 | $(shell find * -name "*.css") \ 9 | $(shell find * -name "*.png") \ 10 | $(shell find * -name "*.html") 11 | 12 | telemetry-analysis-service.tar.gz: $(FILES) 13 | tar -czf $@ $^ 14 | 15 | analysis-service-stack.json: analysis-service-stack.yaml 16 | $(CFYAML) $< > $@ 17 | 18 | analysis-resources.json: analysis-resources.yaml 19 | $(CFYAML) $< > $@ 20 | 21 | put: telemetry-analysis-service.tar.gz analysis-service-stack.json 22 | aws s3 cp telemetry-analysis-service.tar.gz s3://$(SOURCES_BUCKET)/v$(VERSION)/telemetry-analysis-service.tar.gz 23 | aws s3 cp analysis-service-stack.json s3://$(SOURCES_BUCKET)/v$(VERSION)/analysis-service-stack.json 24 | 25 | clean: 26 | rm -f telemetry-analysis-service.tar.gz analysis-service-stack.json 27 | 28 | .PHONY: put 29 | -------------------------------------------------------------------------------- /http/analysis-service/requirements: -------------------------------------------------------------------------------- 1 | boto==2.38.0 2 | botocore==1.3.9 3 | docutils==0.12 4 | Flask==0.10.1 5 | Flask-BrowserID==0.0.4 6 | Flask-Login==0.3.2 7 | futures==2.2.0 8 | itsdangerous==0.24 9 | Jinja2==2.8 10 | jmespath==0.9.0 11 | MarkupSafe==0.23 12 | python-dateutil==2.4.2 13 | requests==2.8.1 14 | six==1.10.0 15 | SQLAlchemy==1.0.9 16 | Werkzeug==0.11.2 17 | -------------------------------------------------------------------------------- /http/analysis-service/static/cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/http/analysis-service/static/cluster.png -------------------------------------------------------------------------------- /http/analysis-service/static/schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/http/analysis-service/static/schedule.png -------------------------------------------------------------------------------- /http/analysis-service/static/style.css: -------------------------------------------------------------------------------- 1 | /* This Source Code Form is subject to the terms of the Mozilla Public 2 | * License, v. 2.0. If a copy of the MPL was not distributed with this 3 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ 4 | 5 | img { 6 | vertical-align: middle; 7 | } 8 | 9 | table { 10 | border-collapse: collapse; 11 | } 12 | 13 | tr:nth-child(odd) { 14 | background-color: #eee; 15 | } 16 | 17 | td { 18 | padding-right: 10px; 19 | } 20 | 21 | td.field-desc { 22 | font-size: 85%; 23 | color: #555; 24 | text-align: left; 25 | } 26 | 27 | td.field-input { 28 | text-align: left; 29 | margin: 10px; 30 | padding: 10px; 31 | } 32 | 33 | td.field-label { 34 | min-width: 15%; 35 | font-weight: bold; 36 | text-align: right; 37 | } 38 | 39 | label.error { 40 | font-size: 85%; 41 | color: red; 42 | text-align: left; 43 | } 44 | 45 | div.field-error { 46 | font-size: 85%; 47 | color: red; 48 | text-align: left; 49 | border: 1px dotted red; 50 | } 51 | 52 | body { 53 | /* Padding to leave room for header and footer */ 54 | padding-top: 50px; 55 | padding-bottom: 80px; 56 | } 57 | 58 | .button-margin { 59 | margin-top: 10px; 60 | } 61 | 62 | html { 63 | position: relative; 64 | min-height: 100%; 65 | } 66 | 67 | #footer { 68 | position: absolute; 69 | bottom: 0; 70 | width: 100%; 71 | /* Set the fixed height of the footer here */ 72 | height: 60px; 73 | background-color: #eee; 74 | } 75 | 76 | .no-select { 77 | -webkit-touch-callout: none; 78 | -webkit-user-select: none; 79 | -khtml-user-select: none; 80 | -moz-user-select: none; 81 | -ms-user-select: none; 82 | user-select: none; 83 | } 84 |
-------------------------------------------------------------------------------- /http/analysis-service/static/worker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/http/analysis-service/static/worker.png
-------------------------------------------------------------------------------- /http/analysis-service/templates/boot-script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /home/ubuntu 4 | 5 | # Install a few dependencies 6 | install() 7 | { 8 | apt-get update 9 | DEBIAN_FRONTEND=noninteractive apt-get -y \ 10 | -o DPkg::Options::=--force-confdef \ 11 | -o DPkg::Options::=--force-confold \ 12 | install $@ 13 | } 14 | install xz-utils python-pip git python-dev ntp python-protobuf python-snappy 15 | pip install --upgrade boto awscli simplejson 16 | 17 | # Get the user's ssh key 18 | python - << END 19 | from boto.s3 import connect_to_region 20 | s3 = connect_to_region('{{ aws_region }}') 21 | b = s3.get_bucket('{{ temporary_bucket }}', validate = False) 22 | k = b.get_key('{{ ssh_key }}') 23 | k.get_contents_to_filename('/home/ubuntu/user_key.pub') 24 | END 25 | 26 | {% if ephemeral_map %} 27 | # RAID0 Configuration: 28 | {% set raid_devices = ephemeral_map.keys()|sort %} 29 | {% set device_list = " ".join(raid_devices) %} 30 | install mdadm xfsprogs 31 | umount /mnt 32 | yes | mdadm --create /dev/md0 --level=0 -c64 --raid-devices={{ raid_devices|length }} {{ device_list }} 33 | echo 'DEVICE {{ device_list }}' >> /etc/mdadm/mdadm.conf 34 | mdadm --detail --scan >> /etc/mdadm/mdadm.conf 35 | mkfs.xfs /dev/md0 36 | mount /dev/md0 /mnt 37 | {% endif %} 38 | 39 | # Set up the user's ssh key 40 | cat /home/ubuntu/user_key.pub >> /home/ubuntu/.ssh/authorized_keys 41 | chmod 600 /home/ubuntu/.ssh/authorized_keys 42 | 43 | # Set the default AWS region 44 | if [ ! -d /home/ubuntu/.aws ]; then 45 | sudo -u ubuntu mkdir /home/ubuntu/.aws 46 | fi
-f /home/ubuntu/.aws/config ]; then 48 | sudo -u ubuntu echo -e "[default]\nregion = {{ aws_region }}" > /home/ubuntu/.aws/config 49 | fi 50 | 51 | # Make telemetry work dir 52 | if [ ! -d /mnt/telemetry ]; then 53 | mkdir /mnt/telemetry 54 | fi 55 | chown ubuntu:ubuntu /mnt/telemetry 56 | 57 | # Setup the motd 58 | sudo cat >/etc/motd <Launch an ad-hoc Spark cluster 8 |

9 | 10 | Launch a Spark cluster in the cloud and use it for custom data analysis.

11 |

12 | The cluster will be available for 24 hours, then it will be automatically 13 | terminated. 14 |

15 |

16 | For a guide of how to do data analysis using Spark, check out 17 | 18 | :rvitillo's blog post on the topic. 19 |

20 | {% endblock %} 21 | {% block content %} 22 |

Launch a cluster:

23 |
24 | 25 | 26 | {% set username = current_user.email.split('@')[0] %} 27 | 28 | {% call macros.make_input('name', 'Cluster Name', 'text', username + '-telemetry-analysis') %} 29 | The cluster name identifies this cluster within AWS. Use something short 30 | like "{{ username }}-charset-usage" 31 | {% endcall %} 32 | 33 | {% call macros.make_input('num_workers', 'Cluster Size', value='1') %} 34 | The number of workers for this cluster. Please keep in mind to use resources 35 | sparingly. Use a single worker to write and debug your job. 36 | {% endcall %} 37 | 38 | {% call macros.make_input('public-ssh-key', 'SSH Public Key', type='file') %} 39 | Your public key file. Usually id_rsa.pub. This will be appended 40 | to the server's authorized_keys to grant you SSH access. Do 41 | not upload a private key file. 42 | {% endcall %} 43 |
44 | 45 |
46 | {% endblock %} 47 | -------------------------------------------------------------------------------- /http/analysis-service/templates/cluster/email.html: -------------------------------------------------------------------------------- 1 | Hi, 2 |
3 |
4 | We've launched an EMR cluster with access to telemetry published data, at your 5 | request. As the cluster powers up you can:
6 |
    7 |
  • monitor the cluster status,
  • 8 |
  • find the public DNS, and
  • 9 |
  • terminate the cluster,
  • 10 |
11 | here:
12 |
13 | {{ monitoring_url }} 14 |
15 |
16 | Please, be sure to kill your cluster when you're done with it. 17 | 18 | -------------------------------------------------------------------------------- /http/analysis-service/templates/cluster/kill.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block cluster_active %} class="active" {% endblock %} 4 | 5 | {% block title %} 6 |

Your cluster is now dying, and will soon be dead.

7 | {% endblock %} 8 | 9 | {% block content %} 10 |

We've requested that your cluster ({{ jobflow_id }}) be killed.

11 | 12 |
13 |
ID:
14 |
{{ jobflow_id }}
15 |
16 |
17 |
State:
18 |
{{ jobflow_state }}
19 |
20 |
21 | {% endblock %} 22 | 23 | -------------------------------------------------------------------------------- /http/analysis-service/templates/cluster/schedule_create.html: -------------------------------------------------------------------------------- 1 | {% extends "cluster/schedule.html" %} 2 | {% block content %} 3 |

Success!

4 | 5 |

Your code has been uploaded to {{ code_s3path }}.

6 |

Any output files found in relative to where the notebook will be execute will 7 | be published at {{ data_s3path }}. The output files will overwrite 8 | anything already in that location in S3.

9 |

The job will be run {{ job_frequency }} at 10 | {{ job_time }}{{ job_dow }}{{ job_dom }}.

11 |

The job will be allowed to run for a max of 12 | {{ job_timeout }} minutes, after which it will be killed. 13 | 14 |

15 | Cron spec will be {{ cron_spec }} 16 |

17 |

18 | Go back 19 |

20 | {% endblock %} 21 | -------------------------------------------------------------------------------- /http/analysis-service/templates/cluster/schedule_delete.html: -------------------------------------------------------------------------------- 1 | {% extends "cluster/schedule.html" %} 2 | {% block title %} 3 |

Delete a scheduled Spark job

4 | {% endblock %} 5 | {% block content %} 6 |

7 | Job {{ job.name }} has been deleted (deleted 8 | {{ result.rowcount }} record{% if result.rowcount > 1 %}s{% endif %}). 9 |

10 |

11 | Go back 12 |

13 | {% endblock %} 14 | -------------------------------------------------------------------------------- /http/analysis-service/templates/cluster/schedule_files.html: -------------------------------------------------------------------------------- 1 | {% import 'macros.html' as macros with context %} 2 | {% extends "base.html" %} 3 | 4 | {% block cluster_schedule_active %} class="active" {% endblock %} 5 | 6 | {% block title %} 7 |

View files for a scheduled Spark job

8 | {% endblock %} 9 | 10 | {% block content %} 11 | {% if files %} 12 |
13 |

14 | View {{ name }} output for job {{ job.id }}: {{ job.name }} below. 15 |

16 | {% for f in files %} 17 |
18 | {% if f.url.endswith(".ipynb") %} 19 |

{{ f.title|default(f.url) }}

20 | {% else %} 21 |

{{ f.title|default(f.url) }}

22 | {% endif %} 23 |
24 | {% endfor %} 25 |
26 |
27 | {% else %} 28 |

29 | There is no {{ name }} output for job {{ job.name }} yet. 30 | Either it hasn't successfully run, or it has always timed out. 31 | Current time limit is {{ job.timeout_minutes }} minutes. 32 |

33 | {% endif %} 34 |

35 | Go back 36 |

37 | {% endblock %} 38 | -------------------------------------------------------------------------------- /http/analysis-service/templates/index.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | {% block home_active %} class="active" {% endblock %} 3 | {% block content %} 4 |
5 |
6 |

Launch a Spark Cluster

7 |

Launch a Spark cluster in the cloud and use it for custom data analysis. 8 | The cluster will be available for 24 hours, then it will be automatically 9 | terminated.

10 |

11 | 12 | 13 | Launch an ad-hoc Spark cluster 14 | 15 |

16 |
17 |
18 |

Schedule a Spark Job

19 |

Run a Spark analysis on a scheduled basis. The output of the analysis 20 | will be published in Amazon S3.

21 |

22 | 23 | 24 | Schedule a Spark analysis job 25 | 26 |

27 |
28 |
29 |
30 |
31 |

Launch a Worker

32 |

Launch a server in the cloud and use it for custom map-reduce data analysis. 33 | The machine will be available for 24 hours, then it will be automatically 34 | terminated.

35 |

36 | 37 | 38 | Launch an ad-hoc analysis worker 39 | 40 |

41 |
42 |
43 |

Schedule a Job

44 |

Run a map-reduce telemetry analysis on a scheduled basis. The output of the analysis 45 | will be published in Amazon S3.

46 |

47 | 48 | 49 | Schedule an analysis job 50 | 51 |

52 |
53 |
54 | {% endblock %} 55 | -------------------------------------------------------------------------------- /http/analysis-service/templates/instance-launched-email.html: -------------------------------------------------------------------------------- 1 | Hi,
2 | We've launched an EC2 instance with access to telemetry published data, at your 3 | request. As the instance powers up you can:
4 |
    5 |
  • Monitor instance status,
  • 6 |
  • Find public DNS, and
  • 7 |
  • Terminate the instance,
  • 8 |
9 | here: {{ monitoring_url }}
10 |
11 | Please, be sure to kill your instance when you're done with it. -------------------------------------------------------------------------------- /http/analysis-service/templates/kill.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block worker_active %} class="active" {% endblock %} 4 | 5 | {% block title %} 6 |

Your instance is now dying, and will soon be dead.

7 | {% endblock %} 8 | 9 | {% block content %} 10 |

We've requested that your instance ({{ instance_id }}) be killed.

11 | 12 |
13 |
ID:
14 |
{{ instance_id }}
15 |
16 |
17 |
State:
18 |
{{ instance_state }}
19 |
20 |
21 |

22 | You can still monitor it just to 23 | make sure it terminates. 24 |

25 | {% endblock %} 26 | 27 | -------------------------------------------------------------------------------- /http/analysis-service/templates/macros.html: -------------------------------------------------------------------------------- 1 | {% macro make_input(name, label, type='text', value=None, required=True) -%} 2 | 3 | 4 | 5 | {% if errors and name in errors %} 6 |
7 | {{ errors[name] }} 8 | {% endif %} 9 | 18 | {% if errors and name in errors %} 19 |
20 | {% endif %} 21 | 22 | 23 | {{ caller() }} 24 | 25 | 26 | {%- endmacro %} 27 | 28 | {% macro begin_select(errors, name, label) -%} 29 | 30 | 31 | 32 | {% if errors and name in errors %} 33 |
34 | {{ errors[name] }} 35 | {% endif %} 36 | 50 | 51 | 52 | {%- endmacro %} 53 | 54 | {% macro end_select() -%} 55 | 56 | 57 | {%- endmacro %} 58 | -------------------------------------------------------------------------------- /http/analysis-service/templates/schedule_create.html: -------------------------------------------------------------------------------- 1 | {% extends "schedule.html" %} 2 | {% block content %} 3 |

Success!

4 | 5 |

Your code has been uploaded to {{ code_s3path }}.

6 |

This tarball will be unpacked on a worker node, then {{ commandline }} 7 | will be invoked.

8 |

Any output files found in {{ output_dir }} (relative to where 9 | the tarball was unpacked) will be published at {{ data_s3path }}. 10 | The output files will overwrite anything already in that location in S3.

11 |

The job will be run {{ job_frequency }} at 12 | {{ job_time }}{{ job_dow }}{{ job_dom }}.

13 |

The job will be allowed to run for a max of 14 | {{ job_timeout }} minutes, after which it will be killed. 15 | 16 |

17 | Cron spec will be {{ cron_spec }} 18 |

19 |

20 | Go back 21 |

22 | {% endblock %} 23 | -------------------------------------------------------------------------------- /http/analysis-service/templates/schedule_delete.html: -------------------------------------------------------------------------------- 1 | {% extends "schedule.html" %} 2 | {% block title %} 3 |

Delete a scheduled analysis job

4 | {% endblock %} 5 | {% block content %} 6 |

7 | Job {{ job.name }} has been deleted (deleted 8 | {{ result.rowcount }} record{% if result.rowcount > 1 %}s{% endif %}). 9 |

10 |

11 | Go back 12 |

13 | {% endblock %} 14 | -------------------------------------------------------------------------------- /http/analysis-service/templates/schedule_files.html: -------------------------------------------------------------------------------- 1 | {% import 'macros.html' as macros with context %} 2 | {% extends "base.html" %} 3 | 4 | {% block schedule_active %} class="active" {% endblock %} 5 | 6 | {% block title %} 7 |

View files for a scheduled job

8 | {% endblock %} 9 | 10 | {% block content %} 11 | {% if files %} 12 |
13 |

14 | View {{ name }} output for job {{ job.id }}: {{ job.name }} below. 15 |

16 | {% for f in files %} 17 | 20 | {% endfor %} 21 |
22 |
23 | {% else %} 24 |

25 | There is no {{ name }} output for job {{ job.name }} yet. 26 | Either it hasn't successfully run, or it has always timed out. 27 | Current time limit is {{ job.timeout_minutes }} minutes. 28 |

29 | {% endif %} 30 | {% endblock %} 31 | -------------------------------------------------------------------------------- /http/analysis-service/templates/worker.html: -------------------------------------------------------------------------------- 1 | {% import 'macros.html' as macros with context %} 2 | {% extends "base.html" %} 3 | 4 | {% block worker_active %} class="active" {% endblock %} 5 | 6 | {% block title %} 7 |

Launch an ad-hoc analysis worker

8 |

9 | 10 | Launch a server in the cloud and use it for custom data analysis.

11 |

12 | The machine will be available for 24 hours, then it will be automatically 13 | terminated. 14 |

15 |

16 | For more info on how to run a custom analysis, check out 17 | 18 | :mreid's blog post on the topic. 19 |

20 |

21 | Or you can read the docs for the 22 | 23 | Telemetry MapReduce framework. 24 |

25 | {% endblock %} 26 | {% block content %} 27 |

Launch a worker:

28 |
29 | 30 | 31 | {% set username = current_user.email.split('@')[0] %} 32 | {% call macros.make_input('name', 'Server Name', 'text', username + '-telemetry-analysis') %} 33 | The server name identifies this machine within AWS. Use something short 34 | like "{{ username }}-charset-usage" 35 | {% endcall %} 36 | 37 | {% call macros.make_input('public-ssh-key', 'SSH Public Key', type='file') %} 38 | Your public key file. Usually id_rsa.pub. This will be appended 39 | to the server's authorized_keys to grant you SSH access. Do 40 | not upload a private key file. 41 | {% endcall %} 42 |
43 | 44 |
45 | {% endblock %} 46 | -------------------------------------------------------------------------------- /http/analysis-service/terminate-expired-instances.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | def main(): 4 | print "ATMO v1 is no longer in charge of terminating expired clusters." 5 | 6 | if __name__ == '__main__': 7 | main() 8 | -------------------------------------------------------------------------------- /http/analysis-service/user.py: -------------------------------------------------------------------------------- 1 | from flask.ext.login import UserMixin, AnonymousUserMixin 2 | 3 | class User(UserMixin): 4 | def __init__(self, email): 5 | self.email = email 6 | 7 | def is_authenticated(self): 8 | return self.email is not None 9 | 10 | def is_authorized(self): 11 | return self.email.endswith('mozilla.com') or self.email.endswith('mozilla.org') 12 | 13 | def is_active(self): 14 | return self.email is not None 15 | 16 | def is_anonymous(self): 17 | return self.email is None 18 | 19 | def get_id(self): 20 | return self.email 21 | 22 | class AnonymousUser(AnonymousUserMixin): 23 | def is_authorized(self): 24 | return False -------------------------------------------------------------------------------- /http/server_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "motd": " ==== Telemetry Server. Accepting Submissions since 2013. ====", 3 | "max_data_length": 204800, 4 | "max_path_length": 10240, 5 | "port": 8080, 6 | "log_path": "/mnt/telemetry/data", 7 | "max_log_age_ms": 300000, 8 | "max_log_size": 524288000, 9 | "include_request_ip": true, 10 | "stats_log_file": "/var/log/telemetry/telemetry-server.log" 11 | } 12 | -------------------------------------------------------------------------------- /mapreduce/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/mapreduce/__init__.py -------------------------------------------------------------------------------- /mapreduce/addon_perf/README.md: -------------------------------------------------------------------------------- 1 | Telemetry map/reduce to analyze bootstrap add-on performance probes 2 | =================================================================== 3 | 4 | Files: 5 | 6 | - addon_perf.py: Telemetry map/reduce to crunch raw data into lines describing: 7 | - exceptions caught during add-on manager initialization 8 | - histogram of time taken by add-on file scans and bootstrap methods 9 | 10 | - combine.py: merge outputs from addon_perf.py and generate .csv format summaries 11 | - weekly-addons-{date}.csv.gz 12 | - weekly-exceptions-{date}.csv.gz 13 | 14 | - run.sh: driver script for Telemetry scheduled daily job - downloads actual M/R code 15 | from GitHub and executes job 16 | 17 | - processAddonPerf.sh: analysis script that runs the telemetry M/R job using addon_perf.py 18 | and then produces the output files by gathering the week's data from S3 and running combine.py 19 | 20 | - filter_template.json: template for M/R job filter; processAddonPerf.sh creates a copy 21 | with the desired date for each M/R run 22 | -------------------------------------------------------------------------------- /mapreduce/addon_perf/addon-perf.json: -------------------------------------------------------------------------------- 1 | { 2 | "sort-options": { 3 | "values":
["Impact", "Popularity", "Median (ms)", "75% (ms)"], 4 | "selected": "Impact" 5 | }, 6 | "filter-options": [ 7 | {"id": "Application", 8 | "values": ["", "Firefox", "Fennec"], 9 | "selected": ""}, 10 | {"id": "Platform", 11 | "values": ["", "WINNT", "Linux", "Darwin", "Android"], 12 | "selected": ""}, 13 | {"id": "Measure", 14 | "values": ["", "startup_MS", "shutdown_MS"], 15 | "selected": ""}, 16 | {"id": "Limit", 17 | "values": [10, 20, 50], 18 | "selected": 10} 19 | ], 20 | "title": ["Telemetry Add-on Performance", "Bootstrap add-on start up and shut down times"], 21 | "primary-key": ["Application", "Platform", "Addon ID", "Version", "Measure"], 22 | "header": ["Application", "Platform", "Addon ID", "Version", "Name", "Measure", 23 | "Popularity", "Impact", "Median (ms)", "75% (ms)", "95% (ms)"], 24 | "url-prefix": "https://s3-us-west-2.amazonaws.com/telemetry-public-analysis/addon_perf/data/weekly_addons" 25 | } 26 | -------------------------------------------------------------------------------- /mapreduce/addon_perf/addon-scan.json: -------------------------------------------------------------------------------- 1 | { 2 | "sort-options": { 3 | "values": ["Impact", "Popularity", "Median Count", "Median Time (ms)", "75% (ms)"], 4 | "selected": "Impact" 5 | }, 6 | "filter-options": [ 7 | {"id": "Application", 8 | "values": ["", "Firefox", "Fennec"], 9 | "selected": ""}, 10 | {"id": "Platform", 11 | "values": ["", "WINNT", "Linux", "Darwin", "Android"], 12 | "selected": ""}, 13 | {"id": "Limit", 14 | "values": [10, 20, 50], 15 | "selected": 10} 16 | ], 17 | "title": ["Unpacked Add-on File Scan", "Start-up scan times for unpacked add-ons"], 18 | "primary-key": ["Application", "Platform", "Addon ID", "Version"], 19 | "header": ["Application", "Platform", "Addon ID", "Version", "Name", "Popularity", 20 | "Impact", "Median Count", "Median Time (ms)", "75% (ms)", "95% (ms)"], 21 | "url-prefix": "https://s3-us-west-2.amazonaws.com/telemetry-public-analysis/addon_perf/data/weekly_unpacked" 22 | } 23 | -------------------------------------------------------------------------------- /mapreduce/addon_perf/addon-versions.py: -------------------------------------------------------------------------------- 1 | # Process daily add-on telemetry extract to see variation in add-on version # 2 | # usage: addon-versions.py input-filename [input-filename ...] 
3 | 4 | import unicodecsv as ucsv 5 | import simplejson as json 6 | import gzip 7 | import sys 8 | import re 9 | from collections import defaultdict, Counter 10 | 11 | APP_COLUMN=1 12 | OS_COLUMN=2 13 | VER_COLUMN=3 14 | CHAN_COLUMN=4 15 | TEXT_COLUMN=5 16 | 17 | # Keep track of how many version #s we see for an add-on ID 18 | addonVersions = defaultdict(Counter) 19 | # Keep track of how many different names we see for a given add-on ID 20 | addonNames = defaultdict(Counter) 21 | 22 | # Total number of pings received 23 | sessions = 0 24 | 25 | for a in sys.argv[1:]: 26 | print "processing", a 27 | with gzip.open(a, 'rb') as f: 28 | 29 | for line in f: 30 | try: 31 | keyblob, datablob = line.split("\t", 1) 32 | key = json.loads(keyblob) 33 | 34 | if key[0] == "E": 35 | if key[5] == 'Sessions': 36 | sessions += int(datablob) 37 | continue 38 | (addonID, sep, version) = key[5].rpartition(':') 39 | data = json.loads(datablob) 40 | addonVersions[addonID][version] += sum(data['name'].values()) 41 | addonNames[addonID].update(data['name']) 42 | except Exception as e: 43 | print "Bad line: " + str(e) + ": " + line 44 | continue 45 | 46 | # Get the most popular name for the add-on, collapsing ugly broken unicode 47 | rx = re.compile(u'\ufffd+') 48 | def getName(addonID): 49 | names = addonNames.get(addonID, {}) 50 | if "?" in names: 51 | del names["?"] 52 | if len(names) < 1: 53 | return "?" 54 | return max(names, key=names.get) 55 | 56 | print sessions, "sessions,", len(addonVersions), "different add-on IDs" 57 | 58 | # Things worth knowing? 59 | # Total different add-on IDs 60 | # total # sessions 61 | # count of IDs that have more than one version 62 | # for each ID: most popular name, total count, # versions, count of most popular version 63 | 64 | writer = ucsv.writer(sys.stdout) 65 | writer.writerow(['Add-on ID', 'name', 'total', 'versions', 'mainVersion', 'count']) 66 | 67 | # add-ons with more than one version 68 | multiversion = 0 69 | 70 | for addonID, counts in addonVersions.iteritems(): 71 | name = getName(addonID) 72 | total = sum(counts.values()) 73 | versions = len(counts) 74 | if '?' 
in counts: 75 | # Don't count the 'disabled' version 76 | versions = versions - 1 77 | if versions < 2: 78 | continue 79 | 80 | multiversion += 1 81 | version, count = max(counts.iteritems(), key=lambda k:k[1]) 82 | 83 | writer.writerow([addonID, name, total, versions, version, count]) 84 | 85 | print 86 | print multiversion, "add-ons with more than one version" 87 | -------------------------------------------------------------------------------- /mapreduce/addon_perf/filter_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dimensions": [ 4 | { 5 | "field_name": "reason", 6 | "allowed_values": ["saved-session"] 7 | }, 8 | { 9 | "field_name": "appName", 10 | "allowed_values": "*" 11 | }, 12 | { 13 | "field_name": "appUpdateChannel", 14 | "allowed_values": "*" 15 | }, 16 | { 17 | "field_name": "appVersion", 18 | "allowed_values": "*" 19 | }, 20 | { 21 | "field_name": "appBuildID", 22 | "allowed_values": "*" 23 | }, 24 | { 25 | "field_name": "submission_date", 26 | "allowed_values": ["__TARGET_DATE__"] 27 | } 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /mapreduce/addon_perf/package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | VERSION=0.2 3 | NAME=addon_perf 4 | TARBALL=${NAME}-$VERSION.tar.gz 5 | 6 | if [ -f "$TARBALL" ]; then 7 | rm -v "$TARBALL" 8 | fi 9 | tar czvf "$TARBALL" \ 10 | run.sh \ 11 | README.md 12 | -------------------------------------------------------------------------------- /mapreduce/addon_perf/processAddonPerf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BASE=$(pwd) 4 | echo "Working in directory $BASE" 5 | 6 | WORK="$BASE/work" 7 | OUTPUT="$BASE/output" 8 | TODAY=$(date +%Y%m%d) 9 | mkdir -p "$OUTPUT" 10 | mkdir -p "$WORK/cache" 11 | 12 | # If we have an argument, process that day. 13 | TARGET=$1 14 | if [ -z "$TARGET" ]; then 15 | # Default to processing "yesterday" 16 | TARGET=$(date -d 'yesterday' +%Y%m%d) 17 | fi 18 | 19 | cd telemetry-server 20 | JOB="mapreduce/addon_perf" 21 | 22 | FILTER="$WORK/filter.json" 23 | echo "Today is $TODAY, and we're gathering addon_perf data for $TARGET" 24 | sed -r "s/__TARGET_DATE__/$TARGET/" $JOB/filter_template.json > $FILTER 25 | 26 | DATA_FILE="$OUTPUT/addon_perf${TARGET}.csv" 27 | 28 | echo "Starting the addon_perf export for $TARGET" 29 | python -u -m mapreduce.job $JOB/addon_perf.py \ 30 | --num-mappers 8 \ 31 | --num-reducers 8 \ 32 | --input-filter $FILTER \ 33 | --data-dir "$WORK/cache" \ 34 | --work-dir $WORK \ 35 | --output $DATA_FILE \ 36 | --bucket telemetry-published-v2 37 | 38 | echo "Mapreduce job exited with code: $?" 39 | 40 | echo "compressing" 41 | gzip $DATA_FILE 42 | echo "Done!" 43 | 44 | echo "Processing weekly data" 45 | cd $BASE 46 | mkdir -p "weekly" 47 | cd weekly 48 | 49 | # Monday is day 1 50 | OFFSET=$(( $(date -d $TARGET +%u) - 1 )) 51 | MONDAY=$(date -d "$TARGET - $OFFSET days" +%Y%m%d) 52 | SUNDAY=$(date -d "$MONDAY + 6 days" +%Y%m%d) 53 | echo "For target '$TARGET', week is $MONDAY to $SUNDAY" 54 | for f in $(seq 0 6); do 55 | DAY=$(date -d "$MONDAY + $f days" +%Y%m%d) 56 | if [ "$DAY" -eq "$TARGET" ]; then 57 | echo "Using local file for today ($DAY)" 58 | cp ${DATA_FILE}.gz . 
59 | else 60 | echo "Fetching $DAY" 61 | aws s3 cp s3://telemetry-public-analysis/addon_perf/data/addon_perf$DAY.csv.gz ./addon_perf$DAY.csv.gz 62 | fi 63 | done 64 | echo "Creating weekly data for $MONDAY to $SUNDAY" 65 | python $BASE/telemetry-server/$JOB/combine.py "$OUTPUT" "$MONDAY" * 66 | echo "Created weekly output file:" 67 | ls -l $OUTPUT/ 68 | 69 | echo "Copying iacomus configs to s3" 70 | cp $BASE/telemetry-server/$JOB/addon-perf.json $BASE/telemetry-server/$JOB/addon-scan.json $OUTPUT 71 | -------------------------------------------------------------------------------- /mapreduce/addon_perf/run-fork.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install additional python modules used by addon_perf analysis 4 | sudo pip install unicodecsv 5 | 6 | # Replace the default telemetry-server install with our own 7 | rm -rf telemetry-server 8 | git clone https://github.com/irvingreid/telemetry-server.git 9 | (cd telemetry-server; git checkout addon-nightly) 10 | 11 | # Now run the actual processing job, using the code from Irving's GitHub 12 | time telemetry-server/mapreduce/addon_perf/processAddonPerf.sh 13 | -------------------------------------------------------------------------------- /mapreduce/addon_perf/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install additional python modules used by addon_perf analysis 4 | sudo pip install unicodecsv 5 | 6 | # Now run the actual processing job 7 | telemetry-server/mapreduce/addon_perf/processAddonPerf.sh 8 | -------------------------------------------------------------------------------- /mapreduce/chromehangs/chromehangs.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | # ChromeHangs export, ported from: 6 | # https://github.com/mozilla-metrics/telemetry-toolbox 7 | 8 | try: 9 | import simplejson as json 10 | except ImportError: 11 | import json 12 | 13 | def check_obj(key, o): 14 | return len(o.get(key, {}).get("memoryMap", [])) > 0 15 | 16 | def map(k, v, cx): 17 | try: 18 | o = v["payload"] 19 | if check_obj("chromeHangs", o) or check_obj("lateWrites", o): 20 | # see https://github.com/mozilla/python_moztelemetry/issues/8 21 | cx.write(k, json.dumps({"chromeHangs": dict(o.get("chromeHangs", {}).items()), 22 | "lateWrites": dict(o.get("lateWrites", {}).items()), 23 | "meta": v.get("meta", {})})) 24 | except Exception as e: 25 | print str(e) 26 | -------------------------------------------------------------------------------- /mapreduce/chromehangs/combine_week.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If we have a target argument, process that day. 4 | TARGET=$1 5 | if [ -z "$TARGET" ]; then 6 | # Default to processing "yesterday" 7 | TARGET=$(date -d 'yesterday' +%Y%m%d) 8 | fi 9 | NAME=$2 10 | if [ -z "$NAME" ]; then 11 | NAME=ChromeHangsWeekly 12 | fi 13 | 14 | OUTPUT=$3 15 | if [ -z "$OUTPUT" ]; then 16 | OUTPUT=output 17 | fi 18 | 19 | BASE=$(pwd) 20 | DATA_FILE=$BASE/$OUTPUT/chromehangs-common-$TARGET.csv.gz 21 | 22 | if [ !
-d "weekly" ]; then 23 | mkdir -p "weekly" 24 | fi 25 | cd weekly 26 | # Monday is day 1 27 | OFFSET=$(( $(date -d $TARGET +%u) - 1 )) 28 | MONDAY=$(date -d "$TARGET - $OFFSET days" +%Y%m%d) 29 | SUNDAY=$(date -d "$MONDAY + 6 days" +%Y%m%d) 30 | echo "For target '$TARGET', week is $MONDAY to $SUNDAY" 31 | for f in $(seq 0 6); do 32 | DAY=$(date -d "$MONDAY + $f days" +%Y%m%d) 33 | if [ "$DAY" -eq "$TARGET" -a -f "$DATA_FILE" ]; then 34 | echo "Using target local file for today ($DAY)" 35 | cp ${DATA_FILE} ./ 36 | elif [ -f "$BASE/chromehangs-common-$DAY.csv.gz" ]; then 37 | echo "Already have local file for $DAY" 38 | cp "$BASE/chromehangs-common-$DAY.csv.gz" ./ 39 | else 40 | echo "Fetching $DAY" 41 | aws s3 cp s3://telemetry-public-analysis-2/$NAME/data/chromehangs-common-$DAY.csv.gz ./ 42 | fi 43 | done 44 | echo "Creating weekly data for $MONDAY to $SUNDAY" 45 | python $BASE/combine.py $BASE/$OUTPUT $MONDAY $SUNDAY 46 | echo "Done!" 47 | -------------------------------------------------------------------------------- /mapreduce/chromehangs/filter_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dimensions": [ 4 | { 5 | "field_name": "prefix", 6 | "allowed_values": "telemetry-2" 7 | }, 8 | { 9 | "field_name": "submissionDate", 10 | "allowed_values": ["__TARGET_DATE__"] 11 | }, 12 | { 13 | "field_name": "sourceName", 14 | "allowed_values": "telemetry" 15 | }, 16 | { 17 | "field_name": "sourceVersion", 18 | "allowed_values": "4" 19 | }, 20 | { 21 | "field_name": "docType", 22 | "allowed_values": "saved_session" 23 | }, 24 | { 25 | "field_name": "appName", 26 | "allowed_values": "Firefox" 27 | }, 28 | { 29 | "field_name": "appUpdateChannel", 30 | "allowed_values": "nightly" 31 | }, 32 | { 33 | "field_name": "appVersion", 34 | "allowed_values": "*" 35 | }, 36 | { 37 | "field_name": "appBuildId", 38 | "allowed_values": "*" 39 | } 40 | ] 41 | } 42 | -------------------------------------------------------------------------------- /mapreduce/chromehangs/package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | VERSION=0.4 3 | NAME=chromehangs 4 | TARBALL=${NAME}-$VERSION.tar.gz 5 | 6 | if [ -f "$TARBALL" ]; then 7 | rm -v "$TARBALL" 8 | fi 9 | tar czvf "$TARBALL" \ 10 | filter_template.json \ 11 | run.sh \ 12 | run_public.sh \ 13 | symbolicate.py \ 14 | extract_common_stacks.py \ 15 | combine.py \ 16 | combine_week.sh \ 17 | chromehangs.py 18 | 19 | echo "Packaged $NAME code as $TARBALL" 20 | -------------------------------------------------------------------------------- /mapreduce/chromehangs/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | OUTPUT=output 4 | NAME=ChromeHangs 5 | TODAY=$(date +%Y%m%d) 6 | if [ ! -d "$OUTPUT" ]; then 7 | mkdir -p "$OUTPUT" 8 | fi 9 | 10 | if [ ! -d "temp" ]; then 11 | mkdir -p "temp" 12 | fi 13 | if [ ! -d "work" ]; then 14 | mkdir -p "work" 15 | fi 16 | if [ ! -d "data" ]; then 17 | mkdir -p "data" 18 | fi 19 | 20 | # If we have an argument, process that day. 
21 | TARGET=$1 22 | if [ -z "$TARGET" ]; then 23 | # Default to processing "yesterday" 24 | TARGET=$(date -d 'yesterday' +%Y%m%d) 25 | fi 26 | 27 | echo "Today is $TODAY, and we're gathering $NAME data for $TARGET" 28 | sed -r "s/__TARGET_DATE__/$TARGET/" filter_template.json > filter.json 29 | 30 | BASE=$(pwd) 31 | RAW_DATA_FILE=$BASE/$OUTPUT/chromehangs-raw-$TARGET.txt 32 | FINAL_DATA_FILE=$BASE/$OUTPUT/chromehangs-symbolicated-$TARGET.txt.gz 33 | COMBINED_DATA_FILE=$BASE/$OUTPUT/chromehangs-common-$TARGET.txt 34 | 35 | cd ~/telemetry-server 36 | echo "Starting the $NAME export for $TARGET" 37 | python -u -m mapreduce.hekajob $BASE/chromehangs.py \ 38 | --delete-data \ 39 | --num-mappers 16 \ 40 | --input-filter $BASE/filter.json \ 41 | --data-dir $BASE/data \ 42 | --work-dir $BASE/work \ 43 | --output $RAW_DATA_FILE \ 44 | --bucket "net-mozaws-prod-us-west-2-pipeline-data" 45 | 46 | echo "Mapreduce job exited with code: $?" 47 | 48 | cd - 49 | echo "Looking for 'error' lines:" 50 | grep -e "^Error," $RAW_DATA_FILE 51 | echo "End of error lines." 52 | 53 | echo "Symbolicating outputs..." 54 | time python symbolicate.py -i $RAW_DATA_FILE -o $FINAL_DATA_FILE -d $TARGET &> symbolicate.out 55 | SYMBOLICATE_CODE=$? 56 | 57 | if [ $SYMBOLICATE_CODE -eq 0 ]; then 58 | echo "Symbolication succeeded (exited with code $SYMBOLICATE_CODE)" 59 | else 60 | echo "Symbolication failed (exited with code $SYMBOLICATE_CODE). Log:" 61 | cat symbolicate.out 62 | fi 63 | 64 | echo "Extracting common stacks..." 65 | time python extract_common_stacks.py -i $FINAL_DATA_FILE -o $COMBINED_DATA_FILE 66 | 67 | echo "Compressing raw output..." 68 | gzip $RAW_DATA_FILE 69 | 70 | echo "Compressing combined stacks..." 71 | gzip $COMBINED_DATA_FILE 72 | 73 | echo "Done!" 74 | -------------------------------------------------------------------------------- /mapreduce/chromehangs/run_public.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | OUTPUT=output 4 | NAME=ChromeHangsWeekly 5 | TODAY=$(date +%Y%m%d) 6 | if [ ! -d "$OUTPUT" ]; then 7 | mkdir -p "$OUTPUT" 8 | fi 9 | 10 | if [ ! -d "temp" ]; then 11 | mkdir -p "temp" 12 | fi 13 | if [ ! -d "work" ]; then 14 | mkdir -p "work" 15 | fi 16 | if [ ! -d "data" ]; then 17 | mkdir -p "data" 18 | fi 19 | 20 | # If we have an argument, process that day. 21 | TARGET=$1 22 | if [ -z "$TARGET" ]; then 23 | # Default to processing "yesterday" 24 | TARGET=$(date -d 'yesterday' +%Y%m%d) 25 | fi 26 | 27 | echo "Today is $TODAY, and we're gathering $NAME data for $TARGET" 28 | sed -r "s/__TARGET_DATE__/$TARGET/" filter_template.json > filter.json 29 | 30 | BASE=$(pwd) 31 | RAW_DATA_FILE=$BASE/chromehangs-raw-$TARGET.txt 32 | FINAL_DATA_FILE=$BASE/chromehangs-symbolicated-$TARGET.txt.gz 33 | COMBINED_DATA_FILE=$BASE/$OUTPUT/chromehangs-common-$TARGET.csv 34 | 35 | cd ~/telemetry-server 36 | echo "Starting the $NAME export for $TARGET" 37 | python -u -m mapreduce.hekajob $BASE/chromehangs.py \ 38 | --delete-data \ 39 | --num-mappers 16 \ 40 | --input-filter $BASE/filter.json \ 41 | --data-dir $BASE/data \ 42 | --work-dir $BASE/work \ 43 | --output $RAW_DATA_FILE \ 44 | --bucket "net-mozaws-prod-us-west-2-pipeline-data" 45 | 46 | echo "Mapreduce job exited with code: $?" 47 | 48 | cd - 49 | echo "Looking for 'error' lines:" 50 | grep -e "^Error," $RAW_DATA_FILE 51 | echo "End of error lines." 52 | 53 | echo "Symbolicating outputs..." 
54 | time python symbolicate.py -i $RAW_DATA_FILE -o $FINAL_DATA_FILE -d $TARGET &> symbolicate.out 55 | SYMBOLICATE_CODE=$? 56 | 57 | if [ $SYMBOLICATE_CODE -eq 0 ]; then 58 | echo "Symbolication succeeded (exited with code $SYMBOLICATE_CODE)" 59 | else 60 | echo "Symbolication failed (exited with code $SYMBOLICATE_CODE). Log:" 61 | cat symbolicate.out 62 | fi 63 | 64 | echo "Extracting common stacks..." 65 | time python extract_common_stacks.py -i $FINAL_DATA_FILE -o $COMBINED_DATA_FILE 66 | 67 | echo "Compressing combined stacks..." 68 | gzip $COMBINED_DATA_FILE 69 | 70 | echo "Processing weekly data" 71 | cd $BASE 72 | bash combine_week.sh "$TARGET" "$NAME" "$OUTPUT" 73 | echo "Done!" 74 | -------------------------------------------------------------------------------- /mapreduce/examples/heka/distribution.py: -------------------------------------------------------------------------------- 1 | # Same as the osdistribution.py example in jydoop 2 | import json 3 | 4 | def map(k, v, cx): 5 | os = v['environment']['system']['os']['name'] 6 | cx.write(os, 1) 7 | 8 | def reduce(k, v, cx): 9 | cx.write(k, sum(v)) 10 | -------------------------------------------------------------------------------- /mapreduce/examples/heka/filter.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dimensions": [ 4 | { 5 | "field_name": "prefix", 6 | "allowed_values": "telemetry-2" 7 | }, 8 | { 9 | "field_name": "submissionDate", 10 | "allowed_values": "20150824" 11 | }, 12 | { 13 | "field_name": "sourceName", 14 | "allowed_values": "telemetry" 15 | }, 16 | { 17 | "field_name": "sourceVersion", 18 | "allowed_values": "4" 19 | }, 20 | { 21 | "field_name": "docType", 22 | "allowed_values": "saved_session" 23 | }, 24 | { 25 | "field_name": "appName", 26 | "allowed_values": "Firefox" 27 | }, 28 | { 29 | "field_name": "appUpdateChannel", 30 | "allowed_values": "nightly" 31 | }, 32 | { 33 | "field_name": "appVersion", 34 | "allowed_values": "43.0a1" 35 | }, 36 | { 37 | "field_name": "appBuildId", 38 | "allowed_values": "20150810122907" 39 | } 40 | ] 41 | } 42 | -------------------------------------------------------------------------------- /mapreduce/examples/heka/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd "$(dirname "$0")" 4 | DIR=$PWD 5 | cd ../../../ 6 | mkdir -p /tmp/telemetry/work/cache 7 | 8 | python -m mapreduce.hekajob $DIR/distribution.py \ 9 | --delete-data \ 10 | --input-filter $DIR/filter.json \ 11 | --num-mappers 16 \ 12 | --num-reducers 4 \ 13 | --data-dir /tmp/telemetry/work \ 14 | --work-dir /tmp/telemetry/work \ 15 | --output /tmp/telemetry/my_mapreduce_results.out \ 16 | --bucket "net-mozaws-prod-us-west-2-pipeline-data" 17 | -------------------------------------------------------------------------------- /mapreduce/examples/v2/dims_only.py: -------------------------------------------------------------------------------- 1 | def map(key, dims, value, context): 2 | submission_day = dims[-1] 3 | context.write(submission_day, 1) 4 | 5 | def reduce(key, values, context): 6 | context.write(key, sum(values)) 7 | -------------------------------------------------------------------------------- /mapreduce/examples/v2/distribution.py: -------------------------------------------------------------------------------- 1 | """ 2 | Get the distribution of one or more boolean/enumerated measurements. 
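Each map output key is a tuple of raw bucket counts for the listed histograms; the reducer emits one CSV row per distinct tuple, with the number of matching submissions appended.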
3 | """ 4 | 5 | import json 6 | 7 | keys = [ 8 | ("NEWTAB_PAGE_SHOWN", 2), # boolean 9 | ("NEWTAB_PAGE_SITE_CLICKED", 10), # 9-bucket 10 | ] 11 | 12 | extra_histogram_entries = 6 # bucketN, sum, log_sum, log_sum_squares, sum_squares_lo, sum_squares_hi 13 | 14 | def map(k, d, v, cx): 15 | j = json.loads(v) 16 | histograms = j.get("histograms", {}) 17 | 18 | counts = () 19 | for key, buckets in keys: 20 | if key in histograms: 21 | val = histograms[key] 22 | if len(val) != buckets + extra_histogram_entries: 23 | raise ValueError("Unexpected length for key %s: %s" % (key, val)) 24 | counts += tuple(val[0:buckets]) 25 | else: 26 | counts += (0,) * buckets 27 | cx.write(counts, 1) 28 | 29 | def reduce(k, v, cx): 30 | cx.writecsv(list(k) + [sum(v)]) 31 | -------------------------------------------------------------------------------- /mapreduce/examples/v2/filter-nightly-buildid.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dimensions": [ 4 | { 5 | "field_name": "reason", 6 | "allowed_values": ["saved-session", "idle-daily"] 7 | }, 8 | { 9 | "field_name": "appName", 10 | "allowed_values": ["Firefox"] 11 | }, 12 | { 13 | "field_name": "appUpdateChannel", 14 | "allowed_values": ["nightly"] 15 | }, 16 | { 17 | "field_name": "appVersion", 18 | "allowed_values": "*" 19 | }, 20 | { 21 | "field_name": "appBuildID", 22 | "allowed_values": { 23 | "min": "20140228" 24 | } 25 | }, 26 | { 27 | "field_name": "submission_date", 28 | "allowed_values": { 29 | "min": "20140308", 30 | "max": "20140309" 31 | } 32 | } 33 | ] 34 | } 35 | -------------------------------------------------------------------------------- /mapreduce/examples/v2/filter_include_all.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dimensions": [ 4 | { 5 | "field_name": "reason", 6 | "allowed_values": "*" 7 | }, 8 | { 9 | "field_name": "appName", 10 | "allowed_values": "*" 11 | }, 12 | { 13 | "field_name": "appUpdateChannel", 14 | "allowed_values": "*" 15 | }, 16 | { 17 | "field_name": "appVersion", 18 | "allowed_values": "*" 19 | }, 20 | { 21 | "field_name": "appBuildID", 22 | "allowed_values": "*" 23 | }, 24 | { 25 | "field_name": "submission_date", 26 | "allowed_values": "*" 27 | } 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /mapreduce/examples/v2/filter_max_buildid.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dimensions": [ 4 | { 5 | "field_name": "reason", 6 | "allowed_values": ["saved-session"] 7 | }, 8 | { 9 | "field_name": "appName", 10 | "allowed_values": ["Firefox","Fennec"] 11 | }, 12 | { 13 | "field_name": "appUpdateChannel", 14 | "allowed_values": ["nightly", "aurora"] 15 | }, 16 | { 17 | "field_name": "appVersion", 18 | "allowed_values": "*" 19 | }, 20 | { 21 | "field_name": "appBuildID", 22 | "allowed_values": { "max": "20130607" } 23 | }, 24 | { 25 | "field_name": "submission_date", 26 | "allowed_values": "*" 27 | } 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /mapreduce/examples/v2/filter_min_buildid.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dimensions": [ 4 | { 5 | "field_name": "reason", 6 | "allowed_values": ["saved-session"] 7 | }, 8 | { 9 | "field_name": "appName", 10 | "allowed_values": ["Firefox","Fennec"] 11 | }, 12 | { 13 | 
"field_name": "appUpdateChannel", 14 | "allowed_values": ["nightly", "aurora"] 15 | }, 16 | { 17 | "field_name": "appVersion", 18 | "allowed_values": "*" 19 | }, 20 | { 21 | "field_name": "appBuildID", 22 | "allowed_values": { "min": "20130600" } 23 | }, 24 | { 25 | "field_name": "submission_date", 26 | "allowed_values": "*" 27 | } 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /mapreduce/examples/v2/filter_min_max_buildid.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dimensions": [ 4 | { 5 | "field_name": "reason", 6 | "allowed_values": ["saved-session"] 7 | }, 8 | { 9 | "field_name": "appName", 10 | "allowed_values": ["Firefox","Fennec"] 11 | }, 12 | { 13 | "field_name": "appUpdateChannel", 14 | "allowed_values": ["nightly", "aurora"] 15 | }, 16 | { 17 | "field_name": "appVersion", 18 | "allowed_values": "*" 19 | }, 20 | { 21 | "field_name": "appBuildID", 22 | "allowed_values": { "min": "20130600", "max": "20130607" } 23 | }, 24 | { 25 | "field_name": "submission_date", 26 | "allowed_values": "*" 27 | } 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /mapreduce/examples/v2/filter_saved_session_Fx_prerelease.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dimensions": [ 4 | { 5 | "field_name": "reason", 6 | "allowed_values": ["saved-session"] 7 | }, 8 | { 9 | "field_name": "appName", 10 | "allowed_values": ["Firefox","Fennec"] 11 | }, 12 | { 13 | "field_name": "appUpdateChannel", 14 | "allowed_values": ["nightly", "aurora"] 15 | }, 16 | { 17 | "field_name": "appVersion", 18 | "allowed_values": "*" 19 | }, 20 | { 21 | "field_name": "appBuildID", 22 | "allowed_values": "*" 23 | }, 24 | { 25 | "field_name": "submission_date", 26 | "allowed_values": { 27 | "min": "20131101", 28 | "max": "20131103" 29 | } 30 | } 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /mapreduce/examples/v2/osdistribution.py: -------------------------------------------------------------------------------- 1 | # Same as the osdistribution.py example in jydoop 2 | import json 3 | 4 | def map(k, d, v, cx): 5 | j = json.loads(v) 6 | os = j['info']['OS'] 7 | cx.write(os, 1) 8 | 9 | def reduce(k, v, cx): 10 | cx.write(k, sum(v)) 11 | -------------------------------------------------------------------------------- /mapreduce/examples/v2/simple_counter.py: -------------------------------------------------------------------------------- 1 | # A very simple MR job to simply count the number of 2 | # occurrences of each key. Useful for investigating 3 | # the number of duplicate submissions. 
4 | 5 | def map(k, d, v, cx): 6 | cx.write(k, 1) 7 | 8 | def reduce(k, v, cx): 9 | cx.write(k, sum(v)) 10 | -------------------------------------------------------------------------------- /mapreduce/examples/v2/trivial.py: -------------------------------------------------------------------------------- 1 | def map(key, dims, value, context): 2 | context.write(key[0:3], 1) 3 | 4 | def reduce(key, values, context): 5 | context.write(key, sum([int(v) for v in values])) 6 | -------------------------------------------------------------------------------- /mapreduce/experiments/experiments.py: -------------------------------------------------------------------------------- 1 | # Experiments export 2 | import simplejson as json 3 | import traceback 4 | import sys 5 | import urllib 6 | 7 | def map(k, d, v, cx): 8 | [reason, appName, appUpdateChannel, appVersion, appBuildID, submission_date] = d 9 | if appName != "Firefox": 10 | print >>sys.stderr, "Got non-Firefox appName", appName 11 | return 12 | 13 | cx.write(("Totals", appUpdateChannel, appVersion), 1) 14 | process = False 15 | if v.find("EXPERIMENT") != -1: 16 | process = True 17 | elif v.find("activeExperiment") != -1: 18 | process = True 19 | 20 | if not process: 21 | return 22 | 23 | try: 24 | j = json.loads(v) 25 | for item in j.get("log", []): 26 | entrytype = item[0] 27 | if entrytype == "EXPERIMENT_ACTIVATION": 28 | cx.write(("EXPERIMENT_ACTIVATION", 29 | appUpdateChannel, 30 | appVersion) + tuple(item[2:]), 1) 31 | elif entrytype == "EXPERIMENT_TERMINATION": 32 | cx.write(("EXPERIMENT_TERMINATION", 33 | appUpdateChannel, 34 | appVersion) + tuple(item[2:]), 1) 35 | 36 | info = j.get("info", {}) 37 | active = info.get("activeExperiment", None) 38 | if active is not None: 39 | activeBranch = info.get("activeExperimentBranch", None) 40 | cx.write(("ACTIVE", appUpdateChannel, appVersion, active, activeBranch), 1) 41 | 42 | except Exception as e: 43 | print >>sys.stderr, "Error during map: ", e 44 | cx.write(("Error",), "%s: %s\n%s" % (e, d, traceback.format_exc())) 45 | 46 | def reduce(k, v, cx): 47 | if k[0] == "Error": 48 | cx.writecsv(("Error", v)) 49 | else: 50 | cx.writecsv(list(k) + [sum(v)]) 51 | -------------------------------------------------------------------------------- /mapreduce/experiments/filter_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dimensions": [ 4 | { 5 | "field_name": "reason", 6 | "allowed_values": ["saved-session"] 7 | }, 8 | { 9 | "field_name": "appName", 10 | "allowed_values": "Firefox" 11 | }, 12 | { 13 | "field_name": "appUpdateChannel", 14 | "allowed_values": "*" 15 | }, 16 | { 17 | "field_name": "appVersion", 18 | "allowed_values": "*" 19 | }, 20 | { 21 | "field_name": "appBuildID", 22 | "allowed_values": "*" 23 | }, 24 | { 25 | "field_name": "submission_date", 26 | "allowed_values": ["__TARGET_DATE__"] 27 | } 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /mapreduce/experiments/postprocess.py: -------------------------------------------------------------------------------- 1 | import sys, os, csv 2 | from collections import defaultdict 3 | import gzip 4 | import simplejson as json 5 | 6 | infile, outpattern = sys.argv[1:] 7 | 8 | class Experiment(object): 9 | def __init__(self): 10 | self.activeBranches = defaultdict(lambda: 0) 11 | self.activations = defaultdict(lambda: 0) 12 | self.terminations = defaultdict(lambda: 0) 13 | 14 | class Channel(object): 15 | def 
__init__(self): 16 | self.total = 0 17 | self.experiments = defaultdict(Experiment) 18 | 19 | def addTotal(self, c): 20 | self.total += c 21 | 22 | def addActive(self, id, branch, c): 23 | self.experiments[id].activeBranches[branch] += c 24 | 25 | def addActivation(self, id, data, c): 26 | self.experiments[id].activations[tuple(data)] += c 27 | 28 | def addTermination(self, id, data, c): 29 | self.experiments[id].terminations[tuple(data)] += c 30 | 31 | channels = defaultdict(lambda: Channel()) 32 | 33 | errors = [] 34 | 35 | lines = csv.reader(open(infile)) 36 | for line in lines: 37 | entrytype = line[0] 38 | if entrytype == "Error": 39 | errors.append(line[1]) 40 | continue 41 | 42 | if entrytype == "Totals": 43 | channel, version, count = line[1:] 44 | count = int(count) 45 | channels[channel].addTotal(count) 46 | elif entrytype == "EXPERIMENT_ACTIVATION": 47 | channel, version, reason, id = line[1:5] 48 | data = line[5:-1] 49 | count = int(line[-1]) 50 | channels[channel].addActivation(id, [reason] + data, count) 51 | elif entrytype == "EXPERIMENT_TERMINATION": 52 | channel, version, reason, id = line[1:5] 53 | data = line[5:-1] 54 | count = int(line[-1]) 55 | channels[channel].addTermination(id, [reason] + data, count) 56 | elif entrytype == "ACTIVE": 57 | channel, version, id, branch, count = line[1:] 58 | count = int(count) 59 | channels[channel].addActive(id, branch, count) 60 | else: 61 | raise ValueError("Unexpected data key, line %i: %s" % (lines.line_num, entrytype)) 62 | 63 | if len(errors): 64 | errorfd = gzip.open("%s-errors.txt.gz" % (outpattern,), "wb") 65 | for e in errors: 66 | print >>errorfd, e 67 | errorfd.close() 68 | 69 | channels = channels.items() 70 | channels.sort(key=lambda i: i[1].total, reverse=True) 71 | 72 | for cname, channel in channels: 73 | d = { 74 | "total": channel.total, 75 | "experiments": {}, 76 | } 77 | for id, experiment in channel.experiments.items(): 78 | d["experiments"][id] = { 79 | "active": experiment.activeBranches, 80 | "activations": experiment.activations.items(), 81 | "terminations": experiment.terminations.items(), 82 | } 83 | fd = gzip.open("%s-%s.json.gz" % (outpattern, cname), "wb") 84 | json.dump(d, fd) 85 | fd.close() 86 | -------------------------------------------------------------------------------- /mapreduce/experiments/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | OUTPUT=output 4 | TODAY=$(date +%Y%m%d) 5 | if [ ! -d "$OUTPUT" ]; then 6 | mkdir -p "$OUTPUT" 7 | fi 8 | 9 | if [ ! -d "job" ]; then 10 | mkdir -p "job" 11 | fi 12 | if [ ! -d "work" ]; then 13 | mkdir -p "work" 14 | fi 15 | if [ ! -d "data" ]; then 16 | mkdir -p "data" 17 | fi 18 | 19 | # If we have an argument, process that day. 
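# e.g. "./run.sh 20140308" gathers the experiment data submitted on 2014-03-08.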
20 | TARGET=$1 21 | if [ -z "$TARGET" ]; then 22 | # Default to processing "yesterday" 23 | TARGET=$(date -d 'yesterday' +%Y%m%d) 24 | fi 25 | 26 | echo "Today is $TODAY, and we're gathering experiment data for $TARGET" 27 | sed -r "s/__TARGET_DATE__/$TARGET/" filter_template.json > filter.json 28 | 29 | BASE=$(pwd) 30 | FINAL_DATA_FILE=$BASE/$OUTPUT/experiments$TARGET 31 | RAW_DATA_FILE=$BASE/data.csv 32 | cd ~/telemetry-server 33 | echo "Starting the experiment export for $TARGET" 34 | python -u -m mapreduce.job $BASE/experiments.py \ 35 | --num-mappers 16 \ 36 | --num-reducers 4 \ 37 | --input-filter $BASE/filter.json \ 38 | --data-dir $BASE/data \ 39 | --work-dir $BASE/work \ 40 | --output $RAW_DATA_FILE \ 41 | --bucket telemetry-published-v2 42 | 43 | echo "Mapreduce job exited with code: $?" 44 | 45 | cd - 46 | 47 | grep -e "^Error," $RAW_DATA_FILE 48 | echo "End of error lines." 49 | 50 | echo "Adding header line and removing error lines..." 51 | python postprocess.py $RAW_DATA_FILE $FINAL_DATA_FILE 52 | echo "Removing temp file" 53 | rm $RAW_DATA_FILE 54 | echo "Listing:" 55 | ls -l $BASE/$OUTPUT/ 56 | echo "Done!" 57 | -------------------------------------------------------------------------------- /mapreduce/flash/csv_header.txt: -------------------------------------------------------------------------------- 1 | appName,appVersion,appUpdateChannel,os,osVersion,flashVersion,count 2 | -------------------------------------------------------------------------------- /mapreduce/flash/filter_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dimensions": [ 4 | { 5 | "field_name": "reason", 6 | "allowed_values": ["saved-session"] 7 | }, 8 | { 9 | "field_name": "appName", 10 | "allowed_values": ["Firefox","Fennec","OTHER"] 11 | }, 12 | { 13 | "field_name": "appUpdateChannel", 14 | "allowed_values": "*" 15 | }, 16 | { 17 | "field_name": "appVersion", 18 | "allowed_values": "*" 19 | }, 20 | { 21 | "field_name": "appBuildID", 22 | "allowed_values": "*" 23 | }, 24 | { 25 | "field_name": "submission_date", 26 | "allowed_values": ["__TARGET_DATE__"] 27 | } 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /mapreduce/flash/flash_versions.py: -------------------------------------------------------------------------------- 1 | # Flash Versions export, ported from: 2 | # https://github.com/mozilla-metrics/telemetry-toolbox 3 | import simplejson as json 4 | import traceback 5 | 6 | def map(k, d, v, cx): 7 | try: 8 | j = json.loads(v) 9 | info = j.get("info", {}) 10 | if "OS" not in info: 11 | return 12 | if "appName" not in info: 13 | return 14 | 15 | os = info["OS"] 16 | appName = info["appName"] 17 | 18 | # Keep [Metro]Firefox documents on windows only 19 | if appName == "Firefox" or appName == "MetroFirefox": 20 | if os != "WINNT": 21 | return 22 | # Also keep all Fennec documents. 
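# All other application names are dropped.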
23 | elif appName != "Fennec": 24 | return 25 | 26 | out_dims = [appName] 27 | for f in ["appVersion", "appUpdateChannel"]: 28 | out_dims.append(info.get(f, "NA")) 29 | out_dims.append(os) 30 | for f in ["version", "flashVersion"]: 31 | out_dims.append(info.get(f, "NA")) 32 | 33 | cx.write(",".join([str(i) for i in out_dims]), 1) 34 | except Exception as e: 35 | cx.write(",".join(["Error", str(e), traceback.format_exc()] + d), 1) 36 | 37 | def setup_reduce(cx): 38 | cx.field_separator = "," 39 | 40 | def reduce(k, v, cx): 41 | cx.write(k, sum(v)) 42 | 43 | combine = reduce 44 | -------------------------------------------------------------------------------- /mapreduce/flash/package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | VERSION=0.1 3 | NAME=flash_versions 4 | TARBALL=${NAME}-$VERSION.tar.gz 5 | 6 | if [ -f "$TARBALL" ]; then 7 | rm -v "$TARBALL" 8 | fi 9 | tar czvf "$TARBALL" \ 10 | csv_header.txt \ 11 | filter_template.json \ 12 | flash_versions.py \ 13 | run.sh 14 | 15 | S3PATH=s3://telemetry-analysis-code/$NAME/$TARBALL 16 | 17 | echo "Packaged $NAME code as $TARBALL" 18 | if [ ! -z "$(which aws)" ]; then 19 | aws s3 cp $TARBALL $S3PATH 20 | echo "Code successfully uploaded to S3" 21 | else 22 | echo "AWS CLI not found - you should manually upload to $S3PATH" 23 | fi 24 | -------------------------------------------------------------------------------- /mapreduce/flash/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | OUTPUT=output 4 | TODAY=$(date +%Y%m%d) 5 | if [ ! -d "$OUTPUT" ]; then 6 | mkdir -p "$OUTPUT" 7 | fi 8 | 9 | if [ ! -d "job" ]; then 10 | mkdir -p "job" 11 | fi 12 | if [ ! -d "work" ]; then 13 | mkdir -p "work" 14 | fi 15 | if [ ! -d "data" ]; then 16 | mkdir -p "data" 17 | fi 18 | 19 | # If we have an argument, process that day. 20 | TARGET=$1 21 | if [ -z "$TARGET" ]; then 22 | # Default to processing "yesterday" 23 | TARGET=$(date -d 'yesterday' +%Y%m%d) 24 | fi 25 | 26 | echo "Today is $TODAY, and we're gathering flash versions for $TARGET" 27 | sed -r "s/__TARGET_DATE__/$TARGET/" filter_template.json > filter_flash.json 28 | 29 | BASE=$(pwd) 30 | cd ~/telemetry-server 31 | echo "Starting the flash versions export for $TARGET" 32 | python -u -m mapreduce.job $BASE/flash_versions.py \ 33 | --num-mappers 16 \ 34 | --input-filter $BASE/filter_flash.json \ 35 | --data-dir $BASE/data \ 36 | --work-dir $BASE/work \ 37 | --output $BASE/$OUTPUT/flash_versions$TARGET.csv.tmp \ 38 | --bucket telemetry-published-v2 39 | 40 | echo "Mapreduce job exited with code: $?" 41 | 42 | cd - 43 | echo "Looking for 'error' lines:" 44 | grep -e "^Error," $OUTPUT/flash_versions$TARGET.csv.tmp 45 | echo "End of error lines." 46 | 47 | echo "Adding header line and removing error lines..." 48 | cp csv_header.txt $OUTPUT/flash_versions$TARGET.csv 49 | grep -ve "^Error," $OUTPUT/flash_versions$TARGET.csv.tmp >> $OUTPUT/flash_versions$TARGET.csv 50 | echo "Removing temp file" 51 | rm $OUTPUT/flash_versions$TARGET.csv.tmp 52 | echo "Compressing output" 53 | gzip $OUTPUT/flash_versions$TARGET.csv 54 | echo "Done!" 
55 | -------------------------------------------------------------------------------- /mapreduce/fxosping/csv_header.txt: -------------------------------------------------------------------------------- 1 | submission_date,os,software,time_to_ping,screen_width,screen_height,pixel_ratio,locale,hardware,model,firmware_revision,update_channel,icc_mnc,icc_mcc,icc_spn,network_mnc,network_mcc,network_operator,geo_country 2 | -------------------------------------------------------------------------------- /mapreduce/fxosping/filter_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dimensions": [ 4 | { 5 | "field_name": "reason", 6 | "allowed_values": ["ftu"] 7 | }, 8 | { 9 | "field_name": "appName", 10 | "allowed_values": ["FirefoxOS"] 11 | }, 12 | { 13 | "field_name": "appUpdateChannel", 14 | "allowed_values": "*" 15 | }, 16 | { 17 | "field_name": "appVersion", 18 | "allowed_values": "*" 19 | }, 20 | { 21 | "field_name": "appBuildID", 22 | "allowed_values": "*" 23 | }, 24 | { 25 | "field_name": "submission_date", 26 | "allowed_values": __TARGET_DATE__ 27 | } 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /mapreduce/fxosping/fxosping.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | def map(key, dims, value, context): 4 | data = json.loads(value) 5 | reason, appName, appUpdateChannel, appVersion, appBuildID, submission_date = dims 6 | 7 | def dataval(key): 8 | return data.get(key, 'unknown') 9 | 10 | def strval(d, key): 11 | if not d: 12 | return 'unknown' 13 | return d.get(key, 'unknown') or 'unknown' 14 | 15 | hours = -1 16 | time_to_ping = 'unknown' 17 | if 'pingTime' in data and 'activationTime' in data: 18 | # Time to ping in hours 19 | hours = float(int(data['pingTime']) - int(data['activationTime'])) / (60 * 60 * 1000) 20 | time_to_ping = '%d' % round(hours) 21 | 22 | context.write(key, submission_date) 23 | context.write(key, strval(data, 'deviceinfo.os')) 24 | context.write(key, strval(data, 'deviceinfo.software')) 25 | context.write(key, time_to_ping) 26 | context.write(key, dataval('screenWidth')) 27 | context.write(key, dataval('screenHeight')) 28 | context.write(key, dataval('devicePixelRatio')) 29 | context.write(key, strval(data, 'locale')) 30 | context.write(key, strval(data, 'deviceinfo.hardware')) 31 | context.write(key, strval(data, 'deviceinfo.product_model')) 32 | context.write(key, strval(data, 'deviceinfo.firmware_revision')) 33 | context.write(key, appUpdateChannel) 34 | 35 | icc = data.get('icc') 36 | context.write(key, strval(icc, 'mnc')) 37 | context.write(key, strval(icc, 'mcc')) 38 | context.write(key, strval(icc, 'spn')) 39 | 40 | network = data.get('network') 41 | context.write(key, strval(network, 'mnc')) 42 | context.write(key, strval(network, 'mcc')) 43 | context.write(key, strval(network, 'operator')) 44 | 45 | info = data.get('info') 46 | context.write(key, strval(info, 'geoCountry')) 47 | 48 | def setup_reduce(context): 49 | context.field_separator = ',' 50 | 51 | def reduce(key, values, context): 52 | context.writecsv(values) 53 | -------------------------------------------------------------------------------- /mapreduce/fxosping/package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | VERSION=0.1 3 | NAME=fxosping 4 | TARBALL=${NAME}-$VERSION.tar.gz 5 | BASE=$(pwd) 6 | THIS_DIR=$(cd "`dirname "$0"`"; pwd) 7 | 8 | if [ 
-f "$TARBALL" ]; then 9 | rm -v "$TARBALL" 10 | fi 11 | 12 | cd "$THIS_DIR" 13 | tar czvf "$BASE/$TARBALL" \ 14 | fxosping.py \ 15 | filter_template.json \ 16 | run.sh 17 | 18 | S3PATH=s3://telemetry-analysis-code/$NAME/$TARBALL 19 | 20 | echo "Packaged $NAME code as $TARBALL" 21 | if [ ! -z "$(which aws)" ]; then 22 | aws s3 cp $TARBALL $S3PATH 23 | echo "Code successfully uploaded to S3" 24 | else 25 | echo "AWS CLI not found - you should manually upload to $S3PATH" 26 | fi 27 | -------------------------------------------------------------------------------- /mapreduce/fxosping/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BASE=$(pwd) 4 | THIS_DIR=$(cd "`dirname "$0"`"; pwd) 5 | TELEMETRY_SERVER_DIR=$(cd "$THIS_DIR/../.."; pwd) 6 | if [ ! -d "$TELEMETRY_SERVER_DIR/mapreduce" ]; then 7 | TELEMETRY_SERVER_DIR=$HOME/telemetry-server 8 | fi 9 | 10 | OUTPUT=${OUTPUT:-output} 11 | TODAY=$(date +%Y%m%d) 12 | 13 | if [ ! -d "$OUTPUT" ]; then 14 | mkdir -p "$OUTPUT" 15 | fi 16 | 17 | if [ ! -d "job" ]; then 18 | mkdir -p "job" 19 | fi 20 | 21 | if [ ! -d "work" ]; then 22 | mkdir -p "work" 23 | fi 24 | 25 | if [ ! -d "data" ]; then 26 | mkdir -p "data" 27 | fi 28 | 29 | # If we have an argument, process that day. 30 | TARGET=$1 31 | if [ -z "$TARGET" ]; then 32 | # Default to processing "yesterday" 33 | TARGET=$(date -d 'yesterday' +%Y%m%d) 34 | fi 35 | 36 | if [ "$TARGET" = "all" ]; then 37 | TARGET_DATE="\"*\"" 38 | else 39 | TARGET_DATE="[\"$TARGET\"]" 40 | fi 41 | 42 | echo "Today is $TODAY, and we're gathering fxosping data for '$TARGET'" 43 | 44 | sed -r "s/__TARGET_DATE__/$TARGET_DATE/" \ 45 | "$THIS_DIR/filter_template.json" > "$THIS_DIR/filter.json" 46 | 47 | cd "$TELEMETRY_SERVER_DIR" 48 | 49 | OUTPUT_FILE=$BASE/$OUTPUT/fxosping_$TARGET.csv 50 | TMP_OUTPUT_FILE=${OUTPUT_FILE}.tmp 51 | 52 | echo "Starting fxosping export for $TARGET" 53 | python -m mapreduce.job "$THIS_DIR/fxosping.py" \ 54 | --input-filter "$THIS_DIR/filter.json" \ 55 | --num-mappers 16 \ 56 | --num-reducers 4 \ 57 | --data-dir "$BASE/data" \ 58 | --work-dir "$BASE/work" \ 59 | --output "$TMP_OUTPUT_FILE" \ 60 | --bucket "telemetry-published-v2" 61 | 62 | echo "Mapreduce job exited with code: $?" 63 | 64 | echo "Adding header line" 65 | cp "$THIS_DIR/csv_header.txt" "$OUTPUT_FILE" 66 | cat "$TMP_OUTPUT_FILE" >> "$OUTPUT_FILE" 67 | 68 | echo "Removing temp file" 69 | rm "$TMP_OUTPUT_FILE" 70 | 71 | cd "$BASE" 72 | echo "Compressing output" 73 | gzip -f "$OUTPUT_FILE" 74 | 75 | echo "Done!" 76 | -------------------------------------------------------------------------------- /mapreduce/loop_failure_summary/failures_by_type.py: -------------------------------------------------------------------------------- 1 | import simplejson as json 2 | 3 | def map(k, d, v, cx): 4 | reason, appName, appUpdateChannel, appVersion, appBuildID, submission_date = d 5 | try: 6 | j = json.loads(v) 7 | 8 | # Filter just the ice failure reports: 9 | if "report" in j and j["report"] == "ice failure": 10 | cx.write(k, (submission_date, j.get("connectionstate", "UNKNOWN"), v)) 11 | except Exception as e: 12 | cx.write("ERROR", str(e)) 13 | 14 | def reduce(k, v, cx): 15 | if k == "ERROR": 16 | for err in v: 17 | cx.write(k, err) 18 | else: 19 | # data contains duplicates, so we just output the first record for each 20 | # key. 
21 | submission_date, connectionstate, payload = v[0] 22 | cx.write(submission_date, "\t".join((connectionstate, payload))) 23 | -------------------------------------------------------------------------------- /mapreduce/loop_failure_summary/filter_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dimensions": [ 4 | { 5 | "field_name": "reason", 6 | "allowed_values": ["loop"] 7 | }, 8 | { 9 | "field_name": "appName", 10 | "allowed_values": "*" 11 | }, 12 | { 13 | "field_name": "appUpdateChannel", 14 | "allowed_values": "*" 15 | }, 16 | { 17 | "field_name": "appVersion", 18 | "allowed_values": "*" 19 | }, 20 | { 21 | "field_name": "appBuildID", 22 | "allowed_values": "*" 23 | }, 24 | { 25 | "field_name": "submission_date", 26 | "allowed_values": ["__TARGET_DATE__"] 27 | } 28 | ] 29 | } 30 | 31 | -------------------------------------------------------------------------------- /mapreduce/loop_failure_summary/header.txt: -------------------------------------------------------------------------------- 1 | submission_date failure_type payload 2 | -------------------------------------------------------------------------------- /mapreduce/loop_failure_summary/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Loop Telemetry 3 | 4 | OUTPUT=output 5 | TODAY=$(date +%Y%m%d) 6 | 7 | JOB_DIR=$(pwd) 8 | BASE=/mnt/telemetry 9 | cd $BASE 10 | if [ ! -d "$OUTPUT" ]; then 11 | mkdir -p "$OUTPUT" 12 | fi 13 | if [ ! -d "work" ]; then 14 | mkdir -p "work" 15 | fi 16 | 17 | if [ ! -d "data" ]; then 18 | mkdir -p "data" 19 | fi 20 | 21 | cd $JOB_DIR 22 | 23 | TARGET=$1 24 | if [ -z "$TARGET" ]; then 25 | # Default to processing "yesterday" 26 | TARGET=$(date -d 'yesterday' +%Y%m%d) 27 | fi 28 | 29 | echo "Today is $TODAY | Gathering data for $TARGET" 30 | sed -r "s/__TARGET_DATE__/$TARGET/" filter_template.json > filter.json 31 | 32 | FINAL_DATA_FILE=$BASE/$OUTPUT/$TARGET.tsv 33 | RAW_DATA_FILE=${FINAL_DATA_FILE}.tmp 34 | cd ~/telemetry-server 35 | echo "Starting the export for data on $TARGET" 36 | echo "running $BASE/failures_by_type.py" 37 | python -u -m mapreduce.job $JOB_DIR/failures_by_type.py \ 38 | --num-mappers 16 \ 39 | --num-reducers 1 \ 40 | --input-filter $JOB_DIR/filter.json \ 41 | --data-dir $BASE/data \ 42 | --work-dir $BASE/work \ 43 | --output $RAW_DATA_FILE \ 44 | --bucket telemetry-published-v2 45 | 46 | cat $JOB_DIR/header.txt > $FINAL_DATA_FILE 47 | cat $RAW_DATA_FILE >> $FINAL_DATA_FILE 48 | rm $RAW_DATA_FILE 49 | 50 | aws s3 cp s3://telemetry-private-analysis-2/loop_failures/data/failures_by_type.json $JOB_DIR/failures_by_type.json 51 | if [ -f "$JOB_DIR/failures_by_type.json" ]; then 52 | # back up the existing one 53 | cp $JOB_DIR/failures_by_type.json $BASE/$OUTPUT/failures_by_type.json.prev 54 | else 55 | # create an empty one. 
56 | touch $JOB_DIR/failures_by_type.json 57 | fi 58 | python $JOB_DIR/summarize.py -i $FINAL_DATA_FILE -o $BASE/$OUTPUT/$TARGET.summary.json -c $JOB_DIR/failures_by_type.json -O $BASE/$OUTPUT/failures_by_type.json 59 | gzip $FINAL_DATA_FILE 60 | -------------------------------------------------------------------------------- /mapreduce/loop_failure_summary/summarize.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import fileinput 3 | import json 4 | import sys 5 | 6 | def main(): 7 | parser = argparse.ArgumentParser(description='Summarize daily loop failures.') 8 | parser.add_argument("-i", "--input-file", help="Filename to read from", required=True, type=file) 9 | parser.add_argument("-o", "--summary-output", help="Filename to save day's data", required=True, type=argparse.FileType('w')) 10 | parser.add_argument("-c", "--combined-input", help="Filename to read combined daily data", type=file) 11 | parser.add_argument("-O", "--combined-output", help="Filename to save combined daily data", required=True, type=argparse.FileType('w')) 12 | args = parser.parse_args() 13 | 14 | headers = None 15 | date_idx = -1 16 | err_idx = -1 17 | date_map = {} 18 | for line in args.input_file: 19 | fields = line.split("\t") 20 | if headers is None: 21 | headers = fields 22 | try: 23 | date_idx = headers.index("submission_date") 24 | err_idx = headers.index("failure_type") 25 | except ValueError as e: 26 | print "Error: required field missing. We need 'submission_date' " \ 27 | "and 'failure_type' to generate a summary" 28 | return 2 29 | else: 30 | submission_date = fields[date_idx] 31 | failure_type = fields[err_idx] 32 | if submission_date not in date_map: 33 | date_map[submission_date] = {} 34 | 35 | if failure_type not in date_map[submission_date]: 36 | date_map[submission_date][failure_type] = 1 37 | else: 38 | date_map[submission_date][failure_type] += 1 39 | 40 | json.dump(date_map, args.summary_output) 41 | try: 42 | combined = json.load(args.combined_input) 43 | except: 44 | combined = [] 45 | 46 | current_index = 0 47 | # Insert each date into the correct spot in the array. 
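# 'combined' stays sorted by date, so advance past every existing entry older than d.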
48 | for d in sorted(date_map.keys()): 49 | date_map[d]["date"] = d 50 | while current_index < len(combined) and d > combined[current_index]["date"]: 51 | current_index += 1 52 | 53 | # if the date is already there, overwrite with new values 54 | if len(combined) > current_index and combined[current_index]["date"] == d: 55 | for k in date_map[d].keys(): 56 | combined[current_index][k] = date_map[d][k] 57 | else: 58 | combined.insert(current_index, date_map[d]) 59 | # Output last 180 days 60 | json.dump(combined[-180:], args.combined_output) 61 | return 0 62 | 63 | if __name__ == "__main__": 64 | sys.exit(main()) 65 | -------------------------------------------------------------------------------- /mapreduce/mainthreadio/csv_header.txt: -------------------------------------------------------------------------------- 1 | app_name,interval,filename,submission_count,median_time,median_count 2 | -------------------------------------------------------------------------------- /mapreduce/mainthreadio/filter_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dimensions": [ 4 | { 5 | "field_name": "reason", 6 | "allowed_values": ["saved-session"] 7 | }, 8 | { 9 | "field_name": "appName", 10 | "allowed_values": "*" 11 | }, 12 | { 13 | "field_name": "appUpdateChannel", 14 | "allowed_values": "nightly" 15 | }, 16 | { 17 | "field_name": "appVersion", 18 | "allowed_values": "*" 19 | }, 20 | { 21 | "field_name": "appBuildID", 22 | "allowed_values": {"min": "__BID_BEGIN__", "max": "__BID_END__999999"} 23 | }, 24 | { 25 | "field_name": "submission_date", 26 | "allowed_values": {"min": "__BEGIN__", "max": "__END__999999"} 27 | } 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /mapreduce/mainthreadio/mainthreadio.py: -------------------------------------------------------------------------------- 1 | import simplejson as json 2 | import numpy 3 | import io 4 | import csv 5 | from string import maketrans 6 | 7 | def clean(s): 8 | return normalize(s).translate(None, ",") 9 | 10 | def normalize(s): 11 | if type(s) == unicode: 12 | return s.encode('utf8', 'ignore') 13 | else: 14 | return str(s) 15 | 16 | def safe_key(pieces): 17 | output = io.BytesIO() 18 | writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL) 19 | writer.writerow(pieces) 20 | return output.getvalue().strip() 21 | 22 | def map(k, d, v, cx): 23 | global n_pings 24 | 25 | if "fileIOReports" not in v or '"fileIOReports":null' in v: 26 | return 27 | 28 | parsed = json.loads(v) 29 | reason, appName, appUpdateChannel, appVersion, appBuildID, submission_date = d 30 | 31 | if not "fileIOReports" in parsed: 32 | return 33 | 34 | if not parsed["fileIOReports"]: 35 | return 36 | 37 | startup_sub = False 38 | execution_sub = False 39 | shutdown_sub = False 40 | 41 | for f, arr in parsed["fileIOReports"].iteritems(): 42 | if len(arr) != 3: # Don't support the old format 43 | continue 44 | 45 | if arr[0] is not None: 46 | cx.write(safe_key([appName, "startup", clean(f)]), [arr[0][0], sum(arr[0][1:])]) 47 | if not startup_sub: 48 | cx.write(safe_key([appName, "startup", "TOTAL"]), [0, 0]) 49 | startup_sub = True 50 | 51 | if arr[1] is not None: 52 | cx.write(safe_key([appName, "execution", clean(f)]), [arr[1][0], sum(arr[1][1:])]) 53 | if not execution_sub: 54 | cx.write(safe_key([appName, "execution", "TOTAL"]), [0, 0]) 55 | execution_sub = True 56 | 57 | if arr[2] is not None: 58 | cx.write(safe_key([appName, "shutdown", clean(f)]), 
[arr[2][0], sum(arr[2][1:])]) 59 | if not shutdown_sub: 60 | cx.write(safe_key([appName, "shutdown", "TOTAL"]), [0, 0]) 61 | shutdown_sub = True 62 | 63 | def setup_reduce(cx): 64 | cx.field_separator = "," 65 | 66 | def reduce(k, v, cx): 67 | totals = [] 68 | counts = [] 69 | 70 | # Use at most 10,000 entries when computing the medians. 71 | sup = min(len(v), 10000) 72 | 73 | for total, count in v[:sup]: 74 | totals.append(total) 75 | counts.append(count) 76 | 77 | # Output fields: 78 | # app_name, interval, filename, submission_count, median_time, median_count 79 | cx.write(k, ",".join([str(len(v)), str(numpy.median(totals)), str(numpy.median(counts))])) 80 | -------------------------------------------------------------------------------- /mapreduce/mainthreadio/package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | VERSION=0.2 3 | NAME=mainthreadio 4 | TARBALL=${NAME}-$VERSION.tar.gz 5 | 6 | if [ -f "$TARBALL" ]; then 7 | rm -v "$TARBALL" 8 | fi 9 | tar czvf "$TARBALL" run.sh 10 | 11 | S3PATH=s3://telemetry-analysis-code/$NAME/$TARBALL 12 | 13 | echo "Packaged $NAME code as $TARBALL" 14 | if [ ! -z "$(which aws)" ]; then 15 | aws s3 cp $TARBALL $S3PATH 16 | echo "Code successfully uploaded to S3" 17 | else 18 | echo "AWS CLI not found - you should manually upload to $S3PATH" 19 | fi 20 | -------------------------------------------------------------------------------- /mapreduce/mainthreadio/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd $(cd -P -- "$(dirname -- "$0")" && pwd -P) 4 | sudo apt-get --yes install python-numpy git 5 | 6 | rm -rf telemetry-server 7 | git clone https://github.com/mozilla/telemetry-server.git 8 | cd telemetry-server/mapreduce/mainthreadio 9 | 10 | OUTPUT=output 11 | TODAY=$(date +%Y%m%d) 12 | if [ ! -d "$OUTPUT" ]; then 13 | mkdir -p "$OUTPUT" 14 | fi 15 | 16 | if [ ! -d "job" ]; then 17 | mkdir -p "job" 18 | fi 19 | if [ ! -d "work" ]; then 20 | mkdir -p "work" 21 | fi 22 | if [ ! -d "data" ]; then 23 | mkdir -p "data" 24 | fi 25 | 26 | # If we have an argument, process that week. 27 | DAYS=$1 28 | if [ -z "$DAYS" ]; then 29 | # Default to processing "last week" 30 | DAYS=0 31 | fi 32 | 33 | BEGIN=$(date -d "$TODAY - $DAYS days - 1 weeks" +%Y%m%d) 34 | END=$(date -d "$TODAY - $DAYS days" +%Y%m%d) 35 | BID_BEGIN=$BEGIN 36 | BID_END=$BEGIN 37 | TARGET=$BID_BEGIN 38 | 39 | echo "Today is $TODAY, and we're gathering mainthreadio data from $BEGIN to $END for build-ids from $BID_BEGIN to $BID_END" 40 | sed -e "s/__BEGIN__/$BEGIN/" -e "s/__END__/$END/" -e "s/__BID_BEGIN__/$BID_BEGIN/" -e "s/__BID_END__/$BID_END/" filter_template.json > filter.json 41 | 42 | BASE=$(pwd) 43 | FINAL_DATA_FILE=$BASE/$OUTPUT/buildid_$TARGET.csv 44 | RAW_DATA_FILE=${FINAL_DATA_FILE}.tmp 45 | 46 | cd ../../ 47 | echo "Starting the mainthreadio export for $TARGET" 48 | python -u -m mapreduce.job $BASE/mainthreadio.py \ 49 | --num-mappers 16 \ 50 | --num-reducers 4 \ 51 | --input-filter $BASE/filter.json \ 52 | --data-dir $BASE/data \ 53 | --work-dir $BASE/work \ 54 | --output $RAW_DATA_FILE \ 55 | --bucket telemetry-published-v2 #--data-dir $BASE/work/cache --local-only 56 | 57 | echo "Mapreduce job exited with code: $?"
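# (usage note: the optional argument shifts the one-week window back by that
#  many days, e.g. "./run.sh 7" processes the week before last; the sed call
#  above splices BEGIN/END and the build-id bounds into filter_template.json,
#  where the "999999" suffix appears to make the max bounds inclusive)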
58 | 59 | echo "Adding header line" 60 | cp $BASE/csv_header.txt $FINAL_DATA_FILE 61 | 62 | echo "Compute summaries" 63 | python $BASE/summary.py $RAW_DATA_FILE 64 | 65 | echo "Copying iacomus configuration" 66 | cp $BASE/iacomus.json $BASE/$OUTPUT 67 | 68 | cat $RAW_DATA_FILE >> $FINAL_DATA_FILE 69 | echo "Removing temp file" 70 | rm $RAW_DATA_FILE 71 | echo "Compressing output" 72 | gzip $FINAL_DATA_FILE 73 | echo "Done!" 74 | -------------------------------------------------------------------------------- /mapreduce/mainthreadio/summary.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import gzip 3 | import os 4 | import re 5 | import sys 6 | import numpy 7 | 8 | APP_COLUMN=0 9 | INTERVAL_COLUMN=1 10 | FILE_COLUMN=2 11 | SUBMISSION_COUNT_COLUMN=3 12 | MEDIAN_TIME_COLUMN=4 13 | MEDIAN_COUNT_COLUMN=5 14 | 15 | input = sys.argv[1] 16 | rows = None 17 | totals = {} 18 | 19 | def key(row): 20 | return str(row[APP_COLUMN]) + str(row[INTERVAL_COLUMN]) 21 | 22 | def parse(): 23 | global rows 24 | 25 | with open(input) as f: 26 | lines = f.readlines() 27 | rows = map(lambda x: x.split(','), lines) 28 | 29 | for i, row in enumerate(rows[:]): 30 | if row[FILE_COLUMN] == "TOTAL": 31 | totals[key(row)] = row 32 | rows.remove(row) 33 | 34 | def normalize(): 35 | global rows 36 | 37 | for row in rows: 38 | k = key(row) 39 | row[SUBMISSION_COUNT_COLUMN] = float(row[SUBMISSION_COUNT_COLUMN]) / float(totals[k][SUBMISSION_COUNT_COLUMN]) 40 | 41 | rows = sorted(rows, key=lambda x: x[SUBMISSION_COUNT_COLUMN], reverse=True) 42 | for row in rows: 43 | row[SUBMISSION_COUNT_COLUMN] = str(row[SUBMISSION_COUNT_COLUMN]) 44 | 45 | def dump(): 46 | with open(input, "w") as f: 47 | for row in rows: 48 | f.write(",".join(row)) 49 | 50 | parse() 51 | normalize() 52 | dump() 53 | -------------------------------------------------------------------------------- /mapreduce/slowsql/csv_header.txt: -------------------------------------------------------------------------------- 1 | thread_type,submission_date,app_name,app_version,app_update_channel,query,document_count,total_invocations,total_duration,median_duration 2 | -------------------------------------------------------------------------------- /mapreduce/slowsql/filter_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dimensions": [ 4 | { 5 | "field_name": "prefix", 6 | "allowed_values": "telemetry-2" 7 | }, 8 | { 9 | "field_name": "submissionDate", 10 | "allowed_values": ["__TARGET_DATE__"] 11 | }, 12 | { 13 | "field_name": "sourceName", 14 | "allowed_values": "telemetry" 15 | }, 16 | { 17 | "field_name": "sourceVersion", 18 | "allowed_values": "4" 19 | }, 20 | { 21 | "field_name": "docType", 22 | "allowed_values": "saved_session" 23 | }, 24 | { 25 | "field_name": "appName", 26 | "allowed_values": "*" 27 | }, 28 | { 29 | "field_name": "appUpdateChannel", 30 | "allowed_values": "*" 31 | }, 32 | { 33 | "field_name": "appVersion", 34 | "allowed_values": "*" 35 | }, 36 | { 37 | "field_name": "appBuildId", 38 | "allowed_values": "*" 39 | } 40 | ] 41 | } 42 | -------------------------------------------------------------------------------- /mapreduce/slowsql/package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | VERSION=0.4 3 | NAME=SlowSQL 4 | TARBALL=${NAME}-$VERSION.tar.gz 5 | if [ ! -d "combine.py" ]; then 6 | echo "Fetching 'combine.py' from github..." 
7 | wget https://github.com/mreid-moz/slowsql-dashboard/raw/master/data/combine.py 8 | else 9 | echo "Using existing 'combine.py'" 10 | fi 11 | 12 | if [ -f "$TARBALL" ]; then 13 | rm -v "$TARBALL" 14 | fi 15 | tar czvf "$TARBALL" \ 16 | combine.py \ 17 | csv_header.txt \ 18 | filter_template.json \ 19 | run.sh \ 20 | slowsql.py 21 | 22 | echo "Packaged $NAME code as $TARBALL" 23 | -------------------------------------------------------------------------------- /mapreduce/slowsql/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | OUTPUT=output 4 | NAME=SlowSQL 5 | TODAY=$(date +%Y%m%d) 6 | if [ ! -d "$OUTPUT" ]; then 7 | mkdir -p "$OUTPUT" 8 | fi 9 | 10 | if [ ! -d "job" ]; then 11 | mkdir -p "job" 12 | fi 13 | if [ ! -d "work" ]; then 14 | mkdir -p "work" 15 | fi 16 | if [ ! -d "data" ]; then 17 | mkdir -p "data" 18 | fi 19 | 20 | # If we have an argument, process that day. 21 | TARGET=$1 22 | if [ -z "$TARGET" ]; then 23 | # Default to processing "yesterday" 24 | TARGET=$(date -d 'yesterday' +%Y%m%d) 25 | fi 26 | 27 | echo "Today is $TODAY, and we're gathering slowsql data for $TARGET" 28 | sed -r "s/__TARGET_DATE__/$TARGET/" filter_template.json > filter.json 29 | 30 | BASE=$(pwd) 31 | FINAL_DATA_FILE=$BASE/$OUTPUT/slowsql$TARGET.csv 32 | RAW_DATA_FILE=${FINAL_DATA_FILE}.tmp 33 | cd ~/telemetry-server 34 | echo "Starting the slowsql export for $TARGET" 35 | python -u -m mapreduce.hekajob $BASE/slowsql.py \ 36 | --delete-data \ 37 | --num-mappers 16 \ 38 | --num-reducers 4 \ 39 | --input-filter $BASE/filter.json \ 40 | --data-dir $BASE/data \ 41 | --work-dir $BASE/work \ 42 | --output $RAW_DATA_FILE \ 43 | --bucket "net-mozaws-prod-us-west-2-pipeline-data" 44 | 45 | echo "Mapreduce job exited with code: $?" 46 | 47 | cd - 48 | echo "Looking for 'error' lines:" 49 | grep -e "^Error," $RAW_DATA_FILE 50 | echo "End of error lines." 51 | 52 | echo "Adding header line and removing error lines..." 53 | cp csv_header.txt $FINAL_DATA_FILE 54 | grep -ve "^Error," $RAW_DATA_FILE >> $FINAL_DATA_FILE 55 | echo "Removing temp file" 56 | rm $RAW_DATA_FILE 57 | echo "Compressing output" 58 | gzip $FINAL_DATA_FILE 59 | echo "Done!" 60 | 61 | echo "Processing weekly data" 62 | cd $BASE 63 | if [ ! 
-d "weekly" ]; then 64 | mkdir -p "weekly" 65 | fi 66 | cd weekly 67 | # Monday is day 1 68 | OFFSET=$(( $(date -d $TARGET +%u) - 1 )) 69 | MONDAY=$(date -d "$TARGET - $OFFSET days" +%Y%m%d) 70 | SUNDAY=$(date -d "$MONDAY + 6 days" +%Y%m%d) 71 | echo "For target '$TARGET', week is $MONDAY to $SUNDAY" 72 | for f in $(seq 0 6); do 73 | DAY=$(date -d "$MONDAY + $f days" +%Y%m%d) 74 | if [ "$DAY" -eq "$TARGET" ]; then 75 | echo "Using local file for today ($DAY)" 76 | cp $BASE/$OUTPUT/slowsql$DAY.csv.gz ./ 77 | else 78 | echo "Fetching $DAY" 79 | aws s3 cp s3://telemetry-public-analysis-2/$NAME/data/slowsql$DAY.csv.gz ./slowsql$DAY.csv.gz 80 | fi 81 | done 82 | echo "Creating weekly data for $MONDAY to $SUNDAY" 83 | python $BASE/combine.py $BASE/$OUTPUT $MONDAY $SUNDAY 84 | echo "Created weekly output files:" 85 | ls -l $BASE/$OUTPUT/ 86 | -------------------------------------------------------------------------------- /mongodb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/mongodb/__init__.py -------------------------------------------------------------------------------- /mongodb/examples/osdistribution.js: -------------------------------------------------------------------------------- 1 | printjson(db.payloads.mapReduce( 2 | function() { emit(this.info.OS, 1);}, 3 | function(key, values) { return Array.sum(values);}, 4 | { 5 | out: { inline: 1} 6 | })) 7 | -------------------------------------------------------------------------------- /monitoring/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/monitoring/__init__.py -------------------------------------------------------------------------------- /monitoring/anomaly_detection/notify.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | # If stdin contains any non-whitespace data, send it as an email using SES. 9 | 10 | import argparse 11 | from boto.ses import connect_to_region as ses_connect 12 | import sys 13 | import traceback 14 | import simplejson as json 15 | 16 | if __name__ == "__main__": 17 | parser = argparse.ArgumentParser(description="Telemetry notifier") 18 | parser.add_argument("-c", "--config", help="Configuration file", type=file) 19 | parser.add_argument("-f", "--from-email", help="Email 'from:' address") 20 | parser.add_argument("-t", "--to-email", help="Email 'to:' address (multiple allowed)", action="append") 21 | parser.add_argument("-s", "--subject", help="Email Subject") 22 | parser.add_argument("-d", "--dry-run", help="Print out what would happen instead of sending email", action="store_true") 23 | args = parser.parse_args() 24 | 25 | message_body = sys.stdin.read().strip() 26 | 27 | if message_body == "": 28 | # nothing to notify about. 29 | if args.dry_run: 30 | print "Would not have sent any mail." 
31 | else: 32 | if args.config: 33 | try: 34 | config = json.load(args.config) 35 | except: 36 | traceback.print_exc() 37 | config = {} 38 | else: 39 | config = {} 40 | 41 | if args.from_email: 42 | config["notify_from"] = args.from_email 43 | 44 | if args.to_email: 45 | config["notify_to"] = args.to_email 46 | 47 | if args.subject: 48 | config["notify_subject"] = args.subject 49 | 50 | if args.dry_run: 51 | print "Here is what we would have sent:" 52 | print " From:", config["notify_from"] 53 | print " To:", config["notify_to"] 54 | print "Subject:", config["notify_subject"] 55 | print " Body:", message_body 56 | else: 57 | ses = ses_connect('us-east-1') # only supported region! 58 | ses.send_email( 59 | source = config["notify_from"], 60 | subject = config["notify_subject"], 61 | format = "text", 62 | body = message_body, 63 | to_addresses = config["notify_to"] 64 | ) 65 | -------------------------------------------------------------------------------- /monitoring/expire_flash_video/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/monitoring/expire_flash_video/__init__.py -------------------------------------------------------------------------------- /monitoring/heka/common.toml: -------------------------------------------------------------------------------- 1 | [hekad] 2 | maxprocs = 4 3 | max_timer_inject = 100 4 | max_process_duration = 1000000 5 | 6 | [TCP:5565] 7 | type = "TcpInput" 8 | parser_type = "message.proto" 9 | decoder = "ProtobufDecoder" 10 | address = ":5565" 11 | [TCP:5565.signer.telemetry_0] 12 | hmac_key = "TODO change on deploy" # TODO update on deploy 13 | 14 | [TelemetrySandboxManager] 15 | type = "SandboxManagerFilter" 16 | message_signer = "telemetry" 17 | message_matcher = "Type == 'heka.control.sandbox'" 18 | max_filters = 10 19 | 20 | [Dashboard] 21 | type = "DashboardOutput" 22 | address = ":4352" 23 | ticker_interval = 10 24 | 25 | [TcpOutput] 26 | address = "10.250.68.186:5565" 27 | message_matcher = "Type == 'heka.sandbox-output' && Fields[payload_type] == 'cbufd'" 28 | -------------------------------------------------------------------------------- /monitoring/heka/incoming_stats.toml: -------------------------------------------------------------------------------- 1 | [TelemetryIncomingStatsInput] 2 | type = "LogstreamerInput" 3 | log_directory = "/mnt/telemetry/log" 4 | file_match = 'telemetry-incoming-stats\.log' 5 | decoder = "TelemetryIncomingStatsDecoder" 6 | 7 | [TelemetryIncomingStatsDecoder] 8 | type = "SandboxDecoder" 9 | script_type = "lua" 10 | filename = "lua_decoders/telemetry_incoming_stats.lua" 11 | 12 | [TelemetryStatsRecords] 13 | type = "SandboxFilter" 14 | message_matcher = "Type == 'telemetry.incoming_stats'" 15 | ticker_interval = 10 16 | script_type = "lua" 17 | filename = "lua_filters/telemetry_stats_records.lua" 18 | preserve_data = true 19 | 20 | [TelemetryStatsBytes] 21 | type = "SandboxFilter" 22 | message_matcher = "Type == 'telemetry.incoming_stats' && Fields[channel] == 'ALL'" 23 | ticker_interval = 10 24 | script_type = "lua" 25 | filename = "lua_filters/telemetry_stats_bytes.lua" 26 | preserve_data = true 27 | 28 | [TelemetryStatsErrors] 29 | type = "SandboxFilter" 30 | message_matcher = "Type == 'telemetry.incoming_stats' && Fields[channel] == 'ALL' && Fields[bad_records] > 0" 31 | ticker_interval = 10 32 | script_type = "lua" 33 | filename = 
"lua_filters/telemetry_stats_errors.lua" 34 | preserve_data = true 35 | -------------------------------------------------------------------------------- /monitoring/heka/lua_decoders/telemetry_server.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | sample input 7 | ------------ 8 | {"url":"/submit/sample","duration_ms":0.547324,"code":200,"size":4819,"level":"info","message":"OK","timestamp":"2013-09-10T20:43:17.217Z"} 9 | 10 | Injected Heka message 11 | --------------------- 12 | Timestamp: 2013-09-10 20:43:17.216999936 +0000 UTC 13 | Type: telemetry.server 14 | Hostname: trink-x230 15 | Pid: 0 16 | UUID: 2be3ed98-89e8-4bd0-a7c4-9aebe8747a8b 17 | Logger: TelemetryServerInput 18 | Payload: 19 | EnvVersion: 20 | Severity: 6 21 | Fields: [ 22 | name:"message" value_string:"OK" 23 | name:"code" value_type:DOUBLE value_double:200 24 | name:"url" value_string:"/submit/sample" 25 | name:"duration" value_type:DOUBLE representation:"ms" value_double:0.547324 26 | name:"size" value_type:DOUBLE representation:"B" value_double:4819 ] 27 | --]] 28 | 29 | require "cjson" 30 | 31 | local dt = require "date_time" 32 | local syslog = require "syslog" 33 | 34 | local metadata = { 35 | duration = {value=0, representation="ms"}, 36 | size = {value=0, representation="B"}, 37 | } 38 | 39 | local msg = { 40 | Timestamp = nil, 41 | Type = "telemetry.server", 42 | Severity = nil, 43 | Fields = nil 44 | } 45 | 46 | function process_message() 47 | json = cjson.decode(read_message("Payload")) 48 | if not json then return -1 end 49 | 50 | local t = lpeg.match(dt.rfc3339, json.timestamp) 51 | if not t then return -1 end 52 | msg.Timestamp = dt.time_to_ns(t) 53 | json.timestamp = nil 54 | 55 | msg.Severity = lpeg.match(syslog.severity, json.level) 56 | json.level = nil 57 | 58 | metadata.duration.value = json.duration_ms 59 | json.duration = metadata.duration 60 | json.duration_ms = nil 61 | 62 | metadata.size.value = json.size 63 | json.size = metadata.size 64 | 65 | msg.Fields = json 66 | if not pcall(inject_message, msg) then return -1 end 67 | 68 | return 0 69 | end 70 | -------------------------------------------------------------------------------- /monitoring/heka/lua_filters/telemetry_channel_metrics.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | require "circular_buffer" 6 | require "string" 7 | 8 | local rows = read_config("rows") or 1440 9 | local sec_per_row = read_config("sec_per_row") or 60 10 | local REQUESTS = 1 11 | local TOTAL_SIZE = 2 12 | 13 | channels = {} 14 | 15 | local function add_channel(channel) 16 | local c = circular_buffer.new(rows, 2, sec_per_row, true) 17 | c:set_header(REQUESTS, "Requests") 18 | c:set_header(TOTAL_SIZE, "Total Size", "KiB") 19 | channels[channel] = c 20 | return c 21 | end 22 | 23 | all = add_channel("ALL") 24 | 25 | function process_message () 26 | local ts = read_message("Timestamp") 27 | if not all:add(ts, REQUESTS, 1) then return 0 end -- outside the buffer 28 | 29 | local rs = read_message("Fields[size]") 30 | if rs then 31 | rs = rs / 1024 32 | else 33 | rs = 0 34 | end 35 | all:add(ts, TOTAL_SIZE, rs) 36 | 37 | local url = read_message("Fields[url]") 38 | local channel = url:match("^/submit/telemetry/[^/]+/[^/]+/[^/]+/[^/]+/([^/]+)") 39 | if not channel then return 0 end 40 | if channel ~= "release" and channel ~= "beta" and channel ~= "aurora" and channel ~= "nightly" then 41 | channel = "other" 42 | end 43 | 44 | local c = channels[channel] 45 | if not c then 46 | c = add_channel(channel) 47 | end 48 | c:add(ts, REQUESTS, 1) 49 | c:add(ts, TOTAL_SIZE, rs) 50 | 51 | return 0 52 | end 53 | 54 | function timer_event(ns) 55 | for k, v in pairs(channels) do 56 | inject_message(v:format("cbuf"), k) 57 | inject_message(v:format("cbufd"), k) 58 | end 59 | end 60 | -------------------------------------------------------------------------------- /monitoring/heka/lua_filters/telemetry_server_metrics.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | require "circular_buffer" 6 | 7 | local rows = 1440 8 | local sec_per_row = 60 9 | 10 | request = circular_buffer.new(rows, 4, sec_per_row, true) 11 | local SUCCESS = request:set_header(1, "Success" , "count") 12 | local FAILURE = request:set_header(2, "Failure" , "count") 13 | local REQUEST_SIZE = request:set_header(3, "Request Size", "B") 14 | local REQUEST_TIME = request:set_header(4, "Request Time", "ms") 15 | 16 | function process_message () 17 | local ts = read_message("Timestamp") 18 | if not request:add(ts, REQUEST_TIME, read_message("Fields[duration]")) then 19 | return 0 -- outside the buffer 20 | end 21 | 22 | request:add(ts, REQUEST_SIZE, read_message("Fields[size]")) 23 | 24 | if 200 == read_message("Fields[code]") then 25 | request:add(ts, SUCCESS, 1) 26 | else 27 | request:add(ts, FAILURE, 1) 28 | end 29 | 30 | return 0 31 | end 32 | 33 | function timer_event(ns) 34 | local title = "Request Statistics" 35 | inject_message(request:format("cbuf"), title) 36 | inject_message(request:format("cbufd"), title) 37 | end 38 | -------------------------------------------------------------------------------- /monitoring/heka/lua_filters/telemetry_stats_bytes.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | require "circular_buffer" 6 | require "string" 7 | 8 | local title = "Throughput" 9 | local rows = read_config("rows") or 1440 10 | local sec_per_row = read_config("sec_per_row") or 60 11 | local BYTES_READ = 1 12 | local BYTES_WRITTEN = 2 13 | local BYTES_UNCOMPRESSED = 3 14 | 15 | bytes = circular_buffer.new(rows, 3, sec_per_row, true) 16 | bytes:set_header(BYTES_READ , "Bytes Read" , "B") 17 | bytes:set_header(BYTES_WRITTEN , "Bytes Written" , "B") 18 | bytes:set_header(BYTES_UNCOMPRESSED , "Bytes Uncompressed" , "B") 19 | 20 | function process_message () 21 | local ts = read_message("Timestamp") 22 | if not bytes:add(ts, BYTES_READ, read_message("Fields[bytes_read]")) then 23 | return 0 -- outside the buffer 24 | end 25 | 26 | bytes:add(ts, BYTES_WRITTEN, read_message("Fields[bytes_written]")) 27 | bytes:add(ts, BYTES_UNCOMPRESSED, read_message("Fields[bytes_uncompressed]")) 28 | 29 | return 0 30 | end 31 | 32 | function timer_event(ns) 33 | inject_message(bytes:format("cbuf"), title) 34 | inject_message(bytes:format("cbufd"), title) 35 | end 36 | -------------------------------------------------------------------------------- /monitoring/heka/lua_filters/telemetry_stats_errors.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | require "circular_buffer" 6 | require "string" 7 | 8 | local rows = read_config("rows") or 1440 9 | local sec_per_row = read_config("sec_per_row") or 60 10 | local TOTAL_ERRORS = 1 11 | 12 | errors = {} 13 | 14 | local function add_error(name) 15 | local e = circular_buffer.new(rows, 1, sec_per_row, true) 16 | e:set_header(TOTAL_ERRORS, "Total Errors") 17 | errors[name] = e 18 | return e 19 | end 20 | 21 | local f = {type = 0, name = "", value = 0, representation = "", count = 0, key = ""} 22 | 23 | function process_message () 24 | local ts = read_message("Timestamp") 25 | while true do 26 | f.type, f.name, f.value, f.representation, f.count = read_next_field() 27 | if not f.type then break end 28 | 29 | local name = f.name:match("^bad_records\.(%S+)") 30 | if name then 31 | local e = errors[name] 32 | if not e then 33 | e = add_error(name) 34 | end 35 | if not e:add(ts, TOTAL_ERRORS, f.value) then break end -- outside the buffer 36 | end 37 | end 38 | 39 | return 0 40 | end 41 | 42 | function timer_event(ns) 43 | for k, v in pairs(errors) do 44 | inject_message(v:format("cbuf"), k) 45 | inject_message(v:format("cbufd"), k) 46 | end 47 | end 48 | -------------------------------------------------------------------------------- /monitoring/heka/lua_filters/telemetry_stats_records.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | require "circular_buffer" 6 | require "string" 7 | 8 | local rows = read_config("rows") or 1440 9 | local sec_per_row = read_config("sec_per_row") or 60 10 | local RECORDS_READ = 1 11 | local BAD_RECORDS = 2 12 | 13 | loggers = {} 14 | 15 | local function add_channel(logger, channel) 16 | local c = circular_buffer.new(rows, 2, sec_per_row, true) 17 | c:set_header(RECORDS_READ , "Records Read") 18 | c:set_header(BAD_RECORDS , "Bad Records") 19 | logger[channel] = c 20 | return c 21 | end 22 | 23 | function process_message () 24 | local logger = read_message("Logger") 25 | local l = loggers[logger] 26 | if not l then 27 | l = {} 28 | loggers[logger] = l 29 | end 30 | 31 | local channel = read_message("Fields[channel]") 32 | local c = l[channel] 33 | if not c then 34 | c = add_channel(l, channel) 35 | end 36 | 37 | local ts = read_message("Timestamp") 38 | if not c:add(ts, RECORDS_READ, read_message("Fields[records_read]")) then 39 | return 0 -- outside the buffer 40 | end 41 | 42 | c:add(ts, BAD_RECORDS, read_message("Fields[bad_records]")) 43 | 44 | return 0 45 | end 46 | 47 | function timer_event(ns) 48 | for k, v in pairs(loggers) do 49 | for m, n in pairs(v) do 50 | local title = string.format("%s.%s", k, m) 51 | inject_message(n:format("cbuf"), title) 52 | inject_message(n:format("cbufd"), title) 53 | end 54 | end 55 | end 56 | -------------------------------------------------------------------------------- /monitoring/heka/server.toml: -------------------------------------------------------------------------------- 1 | [TelemetryServerInput] 2 | type = "LogstreamerInput" 3 | log_directory = "/mnt/telemetry/log" 4 | file_match = 'telemetry-server\.log' 5 | decoder = "TelemetryServerDecoder" 6 | 7 | [TelemetryServerDecoder] 8 | type = "SandboxDecoder" 9 | script_type = "lua" 10 | filename = "lua_decoders/telemetry_server.lua" 11 | 12 | [TelemetryServerMetrics] 13 | type = "SandboxFilter" 14 | message_matcher = "Type == 'telemetry.server'" 15 | ticker_interval = 60 16 | script_type = "lua" 17 | filename = "lua_filters/telemetry_server_metrics.lua" 18 | preserve_data = true 19 | 20 | [TelemetryChannelMetrics] 21 | type = "SandboxFilter" 22 | message_matcher = "Type == 'telemetry.server'" 23 | ticker_interval = 60 24 | script_type = "lua" 25 | filename = "lua_filters/telemetry_channel_metrics.lua" 26 | preserve_data = true 27 | 28 | [TelemetryChannelMetrics60Days] 29 | type = "SandboxFilter" 30 | message_matcher = "Type == 'telemetry.server'" 31 | ticker_interval = 60 32 | script_type = "lua" 33 | filename = "lua_filters/telemetry_channel_metrics.lua" 34 | preserve_data = true 35 | 36 | [TelemetryChannelMetrics60Days.config] 37 | rows = 1440 38 | sec_per_row = 3600 39 | -------------------------------------------------------------------------------- /monitoring/process_incoming/viz/css/metrics-graphics-demo.css: -------------------------------------------------------------------------------- 1 | #long svg .y-axis line, 2 | #fake_users3 svg .y-axis line { 3 | opacity: 0.1; 4 | } 5 | 6 | #confidence_band svg .x-axis line { 7 | opacity: 0.1; 8 | } -------------------------------------------------------------------------------- /monitoring/sanitize_fxos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/monitoring/sanitize_fxos/__init__.py -------------------------------------------------------------------------------- 
/monitoring/telemetry.mozilla.org/check_last_update.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | import argparse 9 | import sys 10 | from boto.s3.connection import S3Connection 11 | from datetime import datetime, timedelta 12 | 13 | default_date_format = '%a, %d %b %Y %H:%M:%S %Z' 14 | message_template = "s3://{0}/{1} was modified {2} than {3} hours ago: {4}" 15 | 16 | def is_older(target, max_hrs, date_format=default_date_format, verbose=False): 17 | target_date = datetime.strptime(target, date_format) 18 | now_date = datetime.utcnow() 19 | delta = timedelta(hours=(-max_hrs)) 20 | cutoff_date = now_date + delta 21 | if target_date < cutoff_date: 22 | if verbose: 23 | print target_date.strftime(date_format), "<", cutoff_date.strftime(date_format) 24 | return True 25 | if verbose: 26 | print target_date.strftime(date_format), ">=", cutoff_date.strftime(date_format) 27 | return False 28 | 29 | def get_args(argv): 30 | parser = argparse.ArgumentParser(description="Check the last_modified timestamp of an object in S3") 31 | parser.add_argument("-k", "--aws-key", help="AWS Key", default=None) 32 | parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key", default=None) 33 | parser.add_argument("-b", "--bucket", required=True, help="S3 bucket name") 34 | parser.add_argument("-p", "--path", required=True, help="S3 object path") 35 | parser.add_argument("-m", "--max-age", help="Threshold for alerting (in hours, default is 24)", type=int, default=24) 36 | parser.add_argument("-f", "--date-format", help="Override the default date format", default=default_date_format) 37 | parser.add_argument("-v", "--verbose", action="store_true", help="Print more detailed output") 38 | args = parser.parse_args(argv) 39 | return args 40 | 41 | def main(argv): 42 | args = get_args(argv) 43 | conn = S3Connection(args.aws_key, args.aws_secret_key) 44 | bucket = conn.get_bucket(args.bucket) 45 | key = bucket.get_key(args.path) 46 | 47 | # File was not modified recently. 48 | if is_older(key.last_modified, args.max_age, args.date_format, args.verbose): 49 | print message_template.format(args.bucket, key.name, "more", 50 | args.max_age, key.last_modified) 51 | return 1 52 | 53 | # File was modified recently. 54 | if args.verbose: 55 | print message_template.format(args.bucket, key.name, "less", 56 | args.max_age, key.last_modified) 57 | return 0 58 | 59 | if __name__ == '__main__': 60 | sys.exit(main(sys.argv[1:])) 61 | -------------------------------------------------------------------------------- /process_incoming/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/process_incoming/__init__.py -------------------------------------------------------------------------------- /process_incoming/worker/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
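# (the "convert" binary links against the static "telemetry" library that
#  the common/ subdirectory below assembles from the shared worker sources)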
4 | 5 | add_executable(convert ConvertConfig.cpp convert.cpp) 6 | target_link_libraries(convert telemetry) 7 | 8 | add_subdirectory(common) 9 | -------------------------------------------------------------------------------- /process_incoming/worker/ConvertConfig.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* vim: set ts=2 et sw=2 tw=80: */ 3 | /* This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 | 7 | #ifndef mozilla_telemetry_Convert_Config_h 8 | #define mozilla_telemetry_Convert_Config_h 9 | 10 | #include <boost/filesystem.hpp> 11 | 12 | namespace mozilla { 13 | namespace telemetry { 14 | 15 | struct ConvertConfig 16 | { 17 | std::string mHekaServer; 18 | std::string mHistogramServer; 19 | boost::filesystem::path mTelemetrySchema; 20 | boost::filesystem::path mStoragePath; 21 | boost::filesystem::path mUploadPath; 22 | uint64_t mMaxUncompressed; 23 | size_t mMemoryConstraint; 24 | int mCompressionPreset; 25 | }; 26 | 27 | /** 28 | * Loads the converter configuration from disk. 29 | * 30 | * @param aFile Filename containing the JSON configuration. 31 | * @param aConfig Structure to populate with the configuration. 32 | */ 33 | void ReadConfig(const char* aFile, ConvertConfig& aConfig); 34 | 35 | } 36 | } 37 | 38 | #endif // mozilla_telemetry_Convert_Config_h 39 | -------------------------------------------------------------------------------- /process_incoming/worker/common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | set(TELEMETRY_SRC 6 | TelemetryConstants.cpp 7 | HistogramSpecification.cpp 8 | HistogramCache.cpp 9 | HistogramConverter.cpp 10 | TelemetryRecord.cpp 11 | TelemetrySchema.cpp 12 | RecordWriter.cpp 13 | CompressedFileWriter.cpp 14 | Metric.cpp 15 | message.pb.cc 16 | HekaLogger.cpp) 17 | 18 | add_library(telemetry STATIC ${TELEMETRY_SRC}) 19 | add_dependencies(telemetry rapidjson-0_11) 20 | target_link_libraries(telemetry 21 | ${Boost_LIBRARIES} 22 | ${PROTOBUF_LIBRARIES} 23 | ${ZLIB_LIBRARIES} 24 | ${OPENSSL_LIBRARIES} 25 | ${CMAKE_THREAD_LIBS_INIT} 26 | ${LZMA_LIBRARIES} 27 | -lrt) 28 | 29 | INCLUDE(CPack) 30 | 31 | configure_file(TelemetryConstants.in.cpp ${CMAKE_CURRENT_BINARY_DIR}/TelemetryConstants.cpp) 32 | 33 | add_subdirectory(test) 34 | -------------------------------------------------------------------------------- /process_incoming/worker/common/Common.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* vim: set ts=2 et sw=2 tw=80: */ 3 | /* This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/ 6 | 7 | #ifndef mozilla_common_h 8 | #define mozilla_common_h 9 | 10 | #include <rapidjson/document.h> 11 | 12 | typedef rapidjson::GenericDocument<rapidjson::UTF8<>, rapidjson::CrtAllocator> RapidjsonDocument; 13 | typedef rapidjson::GenericValue<rapidjson::UTF8<>, rapidjson::CrtAllocator> RapidjsonValue; 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /process_incoming/worker/common/CompressedFileWriter.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* vim: set ts=2 et sw=2 tw=80: */ 3 | /* This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 | 7 | #ifndef CompressedFileWriter_h 8 | #define CompressedFileWriter_h 9 | 10 | #include <cstddef> 11 | #include <cstdint> 12 | #include <cstdio> 13 | 14 | #include <lzma.h> 15 | 16 | /** Output buffer size used before writing to the file */ 17 | #define BUF_SIZE BUFSIZ 18 | 19 | namespace mozilla { 20 | namespace telemetry { 21 | 22 | /** 23 | * Wrapper class that writes data to an XZ-compressed file. 24 | * The format is essentially LZMA2; the XZ docs say not to use LZMA1 unless 25 | * you know what you're doing. 26 | */ 27 | class CompressedFileWriter 28 | { 29 | public: 30 | /** Create CompressedFileWriter */ 31 | CompressedFileWriter(); 32 | 33 | /** 34 | * Initialize CompressedFileWriter given an LZMA compression level, a number 35 | * between 0 and 9. 36 | * See preset option in xz(1) for more details. 37 | */ 38 | bool Initialize(FILE *aFile, uint32_t aPreset = 0); 39 | 40 | /** Write buffer to compressed file */ 41 | bool Write(const char* aBuffer, size_t aSize, size_t *aCompressedSize = nullptr); 42 | 43 | /** Finalize compression */ 44 | bool Finalize(size_t *aCompressedSize = nullptr); 45 | 46 | ~CompressedFileWriter(); 47 | private: 48 | FILE* mFile; 49 | lzma_stream mStream; 50 | char mBuffer[BUF_SIZE]; 51 | }; 52 | 53 | } // namespace telemetry 54 | } // namespace mozilla 55 | 56 | #endif // CompressedFileWriter_h 57 | -------------------------------------------------------------------------------- /process_incoming/worker/common/HekaLogger.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* vim: set ts=2 et sw=2 tw=80: */ 3 | /* This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/ 6 | 7 | /// @brief Heka logger implementation @file 8 | 9 | #include "HekaLogger.h" 10 | 11 | #include <boost/asio.hpp> 12 | 13 | using boost::asio::ip::tcp; 14 | 15 | namespace mozilla { 16 | namespace telemetry { 17 | 18 | //////////////////////////////////////////////////////////////////////////////// 19 | HekaLogger::HekaLogger() : mSocket(mIo) { } 20 | 21 | //////////////////////////////////////////////////////////////////////////////// 22 | bool HekaLogger::Connect(const std::string& aHeka) 23 | { 24 | if (mSocket.is_open()) { 25 | mSocket.close(); 26 | } 27 | 28 | size_t pos = aHeka.find(':'); 29 | std::string host = aHeka.substr(0, pos); 30 | std::string port; 31 | if (pos != std::string::npos) { 32 | port = aHeka.substr(pos + 1); 33 | } else { 34 | port = "5565"; 35 | } 36 | try { 37 | boost::asio::ip::tcp::resolver resolver(mIo); 38 | boost::asio::ip::tcp::resolver::query query(host, port); 39 | boost::asio::ip::tcp::resolver::iterator end, i = resolver.resolve(query); 40 | if (end == boost::asio::connect(mSocket, i)) { 41 | return false; 42 | } 43 | } 44 | catch (...) { 45 | return false; 46 | } 47 | return true; 48 | } 49 | 50 | //////////////////////////////////////////////////////////////////////////////// 51 | void HekaLogger::Disconnect() 52 | { 53 | mSocket.close(); 54 | } 55 | 56 | //////////////////////////////////////////////////////////////////////////////// 57 | bool HekaLogger::Write(boost::asio::streambuf& sb) 58 | { 59 | if (!mSocket.is_open()) { 60 | return false; 61 | } 62 | 63 | try { 64 | write(mSocket, sb); 65 | } 66 | catch (...) { 67 | return false; 68 | } 69 | return true; 70 | } 71 | 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /process_incoming/worker/common/HekaLogger.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* vim: set ts=2 et sw=2 tw=80: */ 3 | /* This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 | 7 | /** @file Writes log messages to Heka via TCP */ 8 | 9 | #ifndef mozilla_telemetry_Heka_Logger_h 10 | #define mozilla_telemetry_Heka_Logger_h 11 | 12 | #include <boost/asio.hpp> 13 | #include <string> 14 | 15 | namespace mozilla { 16 | namespace telemetry { 17 | 18 | class HekaLogger 19 | { 20 | public: 21 | HekaLogger(); 22 | 23 | /** 24 | * Connects the logger to a Heka instance. 25 | * 26 | * @param aHeka Hostname:port 27 | * 28 | * @return bool True if a connection could be established. 29 | */ 30 | bool Connect(const std::string& aHeka); 31 | 32 | /** 33 | * Closes the connection to the Heka server. 34 | */ 35 | void Disconnect(); 36 | 37 | /** 38 | * Writes the data to the Heka server. 39 | * 40 | * @param sb Stream buffer containing the data to output. 41 | * 42 | * @return bool True if the data was successfully written to Heka.
43 | */ 44 | bool Write(boost::asio::streambuf& sb); 45 | 46 | bool operator()() 47 | { 48 | return mSocket.is_open(); 49 | } 50 | 51 | private: 52 | boost::asio::io_service mIo; 53 | boost::asio::ip::tcp::socket mSocket; 54 | }; 55 | 56 | } 57 | } 58 | 59 | #endif // mozilla_telemetry_Heka_Logger_h 60 | 61 | -------------------------------------------------------------------------------- /process_incoming/worker/common/HistogramConverter.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* vim: set ts=2 et sw=2 tw=80: */ 3 | /* This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 | 7 | /// Histogram data converter. @file 8 | 9 | #ifndef mozilla_telemetry_Histogram_Converter_h 10 | #define mozilla_telemetry_Histogram_Converter_h 11 | 12 | #include "HistogramCache.h" 13 | 14 | #include 15 | 16 | namespace mozilla { 17 | namespace telemetry { 18 | 19 | bool ConvertHistogramData(HistogramCache& aCache, RapidjsonDocument& aDoc); 20 | 21 | } 22 | } 23 | 24 | #endif // mozilla_telemetry_Histogram_Converter_h 25 | -------------------------------------------------------------------------------- /process_incoming/worker/common/Logger.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* vim: set ts=2 et sw=2 tw=80: */ 3 | /* This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 | 7 | #ifndef mozilla_telemetry_logger_h 8 | #define mozilla_telemetry_logger_h 9 | 10 | #include <boost/log/trivial.hpp> 11 | #include <cstring> 12 | 13 | #define __SHORT_FORM_OF_FILE__ \ 14 | (strrchr(__FILE__,'/') \ 15 | ? strrchr(__FILE__,'/')+1 \ 16 | : __FILE__ \ 17 | ) 18 | 19 | #define LOGGER(level) BOOST_LOG_TRIVIAL(level) << __FUNCTION__ << " @ " << __SHORT_FORM_OF_FILE__ << ":" << __LINE__ << " - " 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /process_incoming/worker/common/Metric.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* vim: set ts=2 et sw=2 tw=80: */ 3 | /* This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/ 6 | 7 | /// @brief Metric implementation @file 8 | 9 | #include "Metric.h" 10 | #include "TelemetryConstants.h" 11 | 12 | #include <ostream> 13 | 14 | namespace mozilla { 15 | namespace telemetry { 16 | 17 | //////////////////////////////////////////////////////////////////////////////// 18 | void 19 | ConstructField(message::Message& aMsg, Metric& aMetric) 20 | { 21 | auto f = aMsg.add_fields(); 22 | f->set_name(aMetric.mName); 23 | f->set_representation(aMetric.mRepresentation); 24 | f->set_value_type(message::Field_ValueType_DOUBLE); 25 | f->add_value_double(aMetric.mValue); 26 | } 27 | 28 | //////////////////////////////////////////////////////////////////////////////// 29 | void 30 | WriteMessage(std::ostream& os, message::Message& aMsg) 31 | { 32 | if (!os) return; 33 | 34 | message::Header h; 35 | h.set_message_length(aMsg.ByteSize()); 36 | os.put(kRecordSeparator); 37 | os.put(h.ByteSize()); 38 | h.SerializeToOstream(&os); 39 | os.put(kUnitSeparator); 40 | aMsg.SerializeToOstream(&os); 41 | } 42 | 43 | } 44 | } 45 | 46 | -------------------------------------------------------------------------------- /process_incoming/worker/common/Metric.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* vim: set ts=2 et sw=2 tw=80: */ 3 | /* This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 | 7 | /** @file 8 | Generic structure for tracking runtime statistics. 9 | */ 10 | 11 | #ifndef mozilla_telemetry_Metric_h 12 | #define mozilla_telemetry_Metric_h 13 | 14 | #include "message.pb.h" 15 | 16 | #include <string> 17 | 18 | namespace mozilla { 19 | namespace telemetry { 20 | 21 | struct Metric 22 | { 23 | Metric(std::string aName, std::string aRepresentation = "count") : 24 | mName(aName), 25 | mRepresentation(aRepresentation), 26 | mValue(0) { } 27 | 28 | std::string mName; 29 | std::string mRepresentation; 30 | double mValue; 31 | }; 32 | 33 | /** 34 | * Helper function to turn a Metric struct into a Heka message field. 35 | * 36 | * @param aMsg Heka protobuf message to add the field to. 37 | * @param aMetric Metric to be converted to a field. 38 | */ 39 | void ConstructField(message::Message &aMsg, Metric& aMetric); 40 | 41 | /** 42 | * Writes a Heka protobuf message with proper framing for stream output. 43 | * 44 | * @param os Output stream receiving the message. 45 | * @param aMsg Message to be framed, encoded, and written. 46 | */ 47 | void WriteMessage(std::ostream &os, message::Message &aMsg); 48 | 49 | } 50 | } 51 | 52 | #endif // mozilla_telemetry_Metric_h 53 | -------------------------------------------------------------------------------- /process_incoming/worker/common/TelemetryConstants.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* vim: set ts=2 et sw=2 tw=80: */ 3 | /* This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/ 6 | 7 | /// Mozilla Telemetry global constants @file 8 | 9 | #ifndef mozilla_telemetry_Telemetry_Constants_h 10 | #define mozilla_telemetry_Telemetry_Constants_h 11 | 12 | #include <string> 13 | 14 | namespace mozilla { 15 | namespace telemetry { 16 | extern const unsigned kVersionMajor; 17 | extern const unsigned kVersionMinor; 18 | extern const unsigned kVersionPatch; 19 | 20 | extern const std::string kProgramName; 21 | extern const std::string kProgramDescription; 22 | 23 | extern const size_t kMaxTelemetryPath; 24 | extern const size_t kMaxTelemetryData; 25 | 26 | extern const char kRecordSeparator; 27 | extern const char kUnitSeparator; 28 | 29 | extern const size_t kExtraBucketsSize; 30 | extern const char* kExtraBuckets[]; 31 | } 32 | } 33 | 34 | #endif // mozilla_telemetry_Telemetry_Constants_h 35 | -------------------------------------------------------------------------------- /process_incoming/worker/common/TelemetryConstants.in.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* vim: set ts=2 et sw=2 tw=80: */ 3 | /* This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 | 7 | /// @brief Prevent duplication of string constants between compilation units @file 8 | 9 | #include "@CMAKE_CURRENT_SOURCE_DIR@/TelemetryConstants.h" 10 | 11 | namespace mozilla { 12 | namespace telemetry { 13 | 14 | const unsigned kVersionMajor = @CPACK_PACKAGE_VERSION_MAJOR@; 15 | const unsigned kVersionMinor = @CPACK_PACKAGE_VERSION_MINOR@; 16 | const unsigned kVersionPatch = @CPACK_PACKAGE_VERSION_PATCH@; 17 | 18 | const std::string kProgramName("@PROJECT_NAME@"); 19 | const std::string kProgramDescription("@CPACK_PACKAGE_DESCRIPTION_SUMMARY@"); 20 | 21 | const size_t kMaxTelemetryPath = 10 * 1024; 22 | const size_t kMaxTelemetryData = 200 * 1024; 23 | 24 | const char kRecordSeparator = 0x1e; 25 | const char kUnitSeparator = 0x1f; 26 | 27 | const size_t kExtraBucketsSize = 5; 28 | const char* kExtraBuckets[] = { "sum", "log_sum", "log_sum_squares", 29 | "sum_squares_lo", "sum_squares_hi", nullptr }; 30 | 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /process_incoming/worker/common/TelemetryRecord.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* vim: set ts=2 et sw=2 tw=80: */ 3 | /* This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 | 7 | /// Telemetry record.
@file 8 | 9 | #ifndef mozilla_telemetry_Telemetry_Record_h 10 | #define mozilla_telemetry_Telemetry_Record_h 11 | 12 | #include "Common.h" 13 | #include "Metric.h" 14 | #include "TelemetryConstants.h" 15 | 16 | #include <boost/noncopyable.hpp> 17 | #include <cstddef> 18 | #include <cstdint> 19 | #include <istream> 20 | 21 | namespace mozilla { 22 | namespace telemetry { 23 | 24 | template <typename T> 25 | std::istream& read_value(std::istream& aInput, T& val) 26 | { 27 | aInput.read((char*)&val, sizeof(val)); 28 | return aInput; 29 | } 30 | 31 | class TelemetryRecord : boost::noncopyable 32 | { 33 | public: 34 | TelemetryRecord(); 35 | ~TelemetryRecord(); 36 | 37 | bool Read(std::istream& aInput); 38 | 39 | const char* GetPath(); 40 | uint64_t GetTimestamp(); 41 | RapidjsonDocument& GetDocument(); 42 | 43 | /** 44 | * Rolls up the internal metric data into the fields element of the provided 45 | * message. The metrics are reset after each call. 46 | * 47 | * @param aMsg The message fields element will be cleared and then populated 48 | * with the TelemetryRecord metrics. 49 | */ 50 | void GetMetrics(message::Message& aMsg); 51 | 52 | private: 53 | struct Metrics { 54 | Metrics() : 55 | mInvalidPathLength("Invalid Path Length"), 56 | mInvalidDataLength("Invalid Data Length"), 57 | mInflateFailures("Inflate Failures"), 58 | mParseFailures("Parse Failures"), 59 | mCorruptData("Corrupt Data", "B") { } 60 | 61 | Metric mInvalidPathLength; 62 | Metric mInvalidDataLength; 63 | Metric mInflateFailures; 64 | Metric mParseFailures; 65 | Metric mCorruptData; 66 | }; 67 | 68 | bool FindRecord(std::istream& aInput); 69 | bool ReadHeader(std::istream& aInput); 70 | bool ProcessRecord(); 71 | int Inflate(); 72 | 73 | RapidjsonDocument mDocument; 74 | 75 | uint16_t mPathLength; 76 | size_t mPathSize; 77 | char* mPath; 78 | 79 | uint32_t mDataLength; 80 | size_t mDataSize; 81 | char* mData; 82 | 83 | uint64_t mTimestamp; 84 | 85 | uint32_t mInflateLength; 86 | size_t mInflateSize; 87 | char* mInflate; 88 | 89 | Metrics mMetrics; 90 | 91 | }; 92 | 93 | } 94 | } 95 | 96 | #endif // mozilla_telemetry_Telemetry_Record_h 97 | -------------------------------------------------------------------------------- /process_incoming/worker/common/message.proto: -------------------------------------------------------------------------------- 1 | package message; 2 | 3 | message Header { 4 | enum HmacHashFunction { 5 | MD5 = 0; 6 | SHA1 = 1; 7 | } 8 | required uint32 message_length = 1; // length in bytes 9 | 10 | optional HmacHashFunction hmac_hash_function = 3 [default = MD5]; 11 | optional string hmac_signer = 4; 12 | optional uint32 hmac_key_version = 5; 13 | optional bytes hmac = 6; 14 | } 15 | 16 | message Field { 17 | enum ValueType { 18 | STRING = 0; 19 | BYTES = 1; 20 | INTEGER = 2; 21 | DOUBLE = 3; 22 | BOOL = 4; 23 | } 24 | required string name = 1; 25 | optional ValueType value_type = 2 [default = STRING]; 26 | optional string representation = 3; 27 | repeated string value_string = 4; 28 | repeated bytes value_bytes = 5; 29 | repeated int64 value_integer = 6 [packed=true]; 30 | repeated double value_double = 7 [packed=true]; 31 | repeated bool value_bool = 8 [packed=true]; 32 | } 33 | 34 | message Message { 35 | required bytes uuid = 1; 36 | required int64 timestamp = 2; // nanoseconds since UNIX epoch 37 | optional string type = 3; 38 | optional string logger = 4; 39 | optional int32 severity = 5 [default = 7]; 40 | optional string payload = 6; 41 | optional string env_version = 7; 42 | optional int32 pid = 8; 43 | optional string hostname = 9; 44 | repeated Field fields =
-------------------------------------------------------------------------------- /process_incoming/worker/common/test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | add_executable(TestHistogramSpecification TestHistogramSpecification.cpp) 6 | target_link_libraries(TestHistogramSpecification telemetry ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY}) 7 | ADD_TEST(TestHistogramSpecification TestHistogramSpecification) 8 | 9 | add_executable(TestHistogramCache TestHistogramCache.cpp) 10 | target_link_libraries(TestHistogramCache telemetry ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY}) 11 | ADD_TEST(TestHistogramCache TestHistogramCache) 12 | 13 | add_executable(TestHistogramConverter TestHistogramConverter.cpp) 14 | target_link_libraries(TestHistogramConverter telemetry ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY}) 15 | ADD_TEST(TestHistogramConverter TestHistogramConverter) 16 | 17 | add_executable(TestTelemetryRecord TestTelemetryRecord.cpp) 18 | target_link_libraries(TestTelemetryRecord telemetry ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY}) 19 | ADD_TEST(TestTelemetryRecord TestTelemetryRecord) 20 | 21 | add_executable(TestTelemetrySchema TestTelemetrySchema.cpp) 22 | target_link_libraries(TestTelemetrySchema telemetry ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY}) 23 | ADD_TEST(TestTelemetrySchema TestTelemetrySchema) 24 | 25 | add_executable(TestRecordWriter TestRecordWriter.cpp) 26 | target_link_libraries(TestRecordWriter telemetry ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY}) 27 | ADD_TEST(TestRecordWriter TestRecordWriter) 28 | 29 | configure_file (${CMAKE_CURRENT_SOURCE_DIR}/TestConfig.in.h ${CMAKE_CURRENT_BINARY_DIR}/TestConfig.h) 30 | include_directories(${CMAKE_CURRENT_BINARY_DIR}) 31 | -------------------------------------------------------------------------------- /process_incoming/worker/common/test/TestConfig.in.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* vim: set ts=2 et sw=2 tw=80: */ 3 | /* This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 | 7 | #ifndef mozilla_telemetry_Test_Config_h 8 | #define mozilla_telemetry_Test_Config_h 9 | #include <string> 10 | 11 | const std::string kDataPath("${CMAKE_CURRENT_SOURCE_DIR}/data/"); 12 | 13 | #endif // mozilla_telemetry_Test_Config_h 14 | -------------------------------------------------------------------------------- /process_incoming/worker/common/test/TestHistogramCache.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* vim: set ts=2 et sw=2 tw=80: */ 3 | /* This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/ 6 | 7 | #define BOOST_TEST_MODULE TestHistogramCache 8 | #include <boost/test/unit_test.hpp> 9 | #include "TestConfig.h" 10 | #include "../HistogramCache.h" 11 | 12 | using namespace std; 13 | using namespace mozilla::telemetry; 14 | 15 | BOOST_AUTO_TEST_CASE(test_valid) 16 | { 17 | HistogramCache cache("localhost:9898"); 18 | auto h = cache.FindHistogram("https://hg.mozilla.org/releases/mozilla-release/rev/a55c55edf302"); 19 | BOOST_REQUIRE(h); 20 | } 21 | 22 | BOOST_AUTO_TEST_CASE(test_unknown_revision) 23 | { 24 | HistogramCache cache("localhost:9898"); 25 | auto h = cache.FindHistogram("https://hg.mozilla.org/releases/mozilla-release/rev/f55c55edf302"); 26 | BOOST_REQUIRE(!h); 27 | } 28 | 29 | BOOST_AUTO_TEST_CASE(test_invalid_revision) 30 | { 31 | HistogramCache cache("localhost:9898"); 32 | auto h = cache.FindHistogram("missing"); 33 | BOOST_REQUIRE(!h); 34 | } 35 | -------------------------------------------------------------------------------- /process_incoming/worker/common/test/TestHistogramConverter.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* vim: set ts=2 et sw=2 tw=80: */ 3 | /* This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 | 7 | #define BOOST_TEST_MODULE TestHistogramConverter 8 | #include <boost/test/unit_test.hpp> 9 | #include "TestConfig.h" 10 | #include "../HistogramConverter.h" 11 | 12 | #include <rapidjson/document.h> 13 | #include <rapidjson/stringbuffer.h> 14 | #include <rapidjson/writer.h> 15 | 16 | using namespace std; 17 | using namespace mozilla::telemetry; 18 | 19 | BOOST_AUTO_TEST_CASE(test_converter) 20 | { 21 | const char* hist = "{\"ver\":1,\"histograms\":{\"A11Y_IATABLE_USAGE_FLAG\":{\"range\":[1,2],\"bucket_count\":3,\"histogram_type\":3,\"values\":{\"0\":1,\"1\":0},\"sum\":4984161763,\"sum_squares_lo\":1.23415,\"sum_squares_hi\":1.01}},\"info\":{\"revision\":\"https://hg.mozilla.org/releases/mozilla-release/rev/a55c55edf302\"}}"; 22 | 23 | const char* conv = "{\"ver\":2,\"histograms\":{\"A11Y_IATABLE_USAGE_FLAG\":[1,0,0,4984161763,-1,-1,1.23415,1.01]},\"info\":{\"revision\":\"https://hg.mozilla.org/releases/mozilla-release/rev/a55c55edf302\"}}"; 24 | 25 | RapidjsonDocument d; 26 | d.Parse<0>(hist); 27 | BOOST_REQUIRE(!d.HasParseError()); 28 | 29 | HistogramCache cache("localhost:9898"); 30 | BOOST_REQUIRE_EQUAL(true, ConvertHistogramData(cache, d)); 31 | rapidjson::StringBuffer sb; 32 | rapidjson::Writer<rapidjson::StringBuffer> writer(sb); 33 | d.Accept(writer); 34 | BOOST_REQUIRE_EQUAL(conv, sb.GetString()); 35 | } 36 |
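The conversion being tested packs each v1 histogram object into a flat v2 array: one entry per bucket count, followed by the five "extra buckets" defined in TelemetryConstants.in.cpp (sum, log_sum, log_sum_squares, sum_squares_lo, sum_squares_hi), with -1 standing in for values the payload did not supply. A minimal Python sketch of that layout, for illustration only; the bucket labels are assumed to be "0".."n-1" as in this test, whereas the real C++ converter derives bucket boundaries from the histogram specification fetched via HistogramCache:

```python
# kExtraBuckets order, from TelemetryConstants.in.cpp
EXTRA_BUCKETS = ["sum", "log_sum", "log_sum_squares",
                 "sum_squares_lo", "sum_squares_hi"]

def to_v2_array(histogram, bucket_count):
    # v2 layout: [count_0 .. count_n-1, <extra buckets>], -1 for missing.
    values = histogram.get("values", {})
    counts = [values.get(str(i), 0) for i in range(bucket_count)]
    extras = [histogram.get(name, -1) for name in EXTRA_BUCKETS]
    return counts + extras

# The same histogram used by the test above.
hist = {"range": [1, 2], "bucket_count": 3, "histogram_type": 3,
        "values": {"0": 1, "1": 0}, "sum": 4984161763,
        "sum_squares_lo": 1.23415, "sum_squares_hi": 1.01}
assert to_v2_array(hist, hist["bucket_count"]) == \
    [1, 0, 0, 4984161763, -1, -1, 1.23415, 1.01]
```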
-------------------------------------------------------------------------------- /process_incoming/worker/common/test/TestRecordWriter.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* vim: set ts=2 et sw=2 tw=80: */ 3 | /* This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 | 7 | #define BOOST_TEST_MODULE TestRecordWriter 8 | #include <boost/test/unit_test.hpp> 9 | #include "TestConfig.h" 10 | #include "../RecordWriter.h" 11 | 12 | #include <boost/filesystem.hpp> 13 | #include <boost/regex.hpp> 14 | 15 | #include <cstdlib> 16 | #include <fstream> 17 | 18 | using namespace std; 19 | using namespace mozilla::telemetry; 20 | 21 | namespace fs = boost::filesystem; 22 | 23 | BOOST_AUTO_TEST_CASE(test_converter) 24 | { 25 | fs::path workDir = ".work"; 26 | fs::path uploadDir = ".upload"; 27 | 28 | BOOST_REQUIRE(!fs::exists(workDir)); 29 | BOOST_REQUIRE(!fs::exists(uploadDir)); 30 | fs::create_directory(workDir); 31 | fs::create_directory(uploadDir); 32 | 33 | RecordWriter writer(workDir.string(), uploadDir.string(), 1048576, 1000, 0); 34 | string payload = "431ab5c3-2712-4ab7-a4b6-e9b61f3a1f30 {\"ver\":2,\"histograms\":{\"A11Y_IATABLE_USAGE_FLAG\":[1,0,0,0,-1,-1,1.23415,1.01]},\"info\":{\"revision\":\"https://hg.mozilla.org/releases/mozilla-release/rev/a55c55edf302\"}}"; 35 | string prefix = "output"; 36 | writer.Write(prefix, payload.c_str(), payload.size() + 1); 37 | writer.Finalize(); 38 | BOOST_REQUIRE(fs::is_empty(workDir)); 39 | BOOST_REQUIRE(!fs::is_empty(uploadDir)); 40 | 41 | fs::directory_iterator it(uploadDir); 42 | fs::path generated = it->path(); 43 | 44 | string filename = generated.leaf().string(); 45 | boost::regex reg(prefix + "\\.v2\\.log\\.[0-9a-f]{32}\\.xz"); 46 | BOOST_REQUIRE(regex_match(filename.begin(), filename.end(), reg)); 47 | 48 | string command = "xz -d " + generated.string(); 49 | BOOST_REQUIRE(system(command.c_str()) == 0); 50 | 51 | fs::path decompressed = generated.replace_extension(); 52 | ifstream decompressedFile(decompressed.string()); 53 | string line; 54 | BOOST_REQUIRE(getline(decompressedFile, line, '\0')); 55 | BOOST_REQUIRE_EQUAL(line, payload); 56 | 57 | fs::remove_all(workDir); 58 | fs::remove_all(uploadDir); 59 | } 60 |
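The test above pins down RecordWriter's on-disk contract: each record is appended NUL-terminated (the write passes payload.size() + 1) to a per-prefix file in the work directory, and Finalize() moves an xz-compressed copy named prefix.v2.log.&lt;32 hex digits&gt;.xz into the upload directory. A minimal sketch of reading such a file back; the filename below is hypothetical, shaped like the regex the test checks, and `lzma` is the Python 3 module (backports.lzma on Python 2):

```python
import lzma  # Python 3; on Python 2 use backports.lzma

def read_records(path):
    # Records inside the .xz stream are NUL-delimited, matching the
    # payload.size() + 1 write in TestRecordWriter above.
    with lzma.open(path) as f:
        data = f.read()
    return [r for r in data.split(b"\x00") if r]

# Hypothetical filename for illustration.
for record in read_records("output.v2.log.0123456789abcdef0123456789abcdef.xz"):
    uuid_part, json_part = record.split(b" ", 1)  # each record is "uuid json"
    print(uuid_part, json_part[:40])
```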
-------------------------------------------------------------------------------- /process_incoming/worker/common/test/TestTelemetryRecord.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* vim: set ts=2 et sw=2 tw=80: */ 3 | /* This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 | 7 | #define BOOST_TEST_MODULE TestTelemetryRecord 8 | #include <boost/test/unit_test.hpp> 9 | #include "TestConfig.h" 10 | #include "../TelemetryRecord.h" 11 | 12 | #include <fstream> 13 | #include <sstream> 14 | #include <string> 15 | 16 | #include <cstdint> 17 | 18 | #include <iostream> 19 | 20 | using namespace std; 21 | using namespace mozilla::telemetry; 22 | 23 | static const string rec("\x1e\x04\x00\x07\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00" "abcd{\"a\":8}", 26); 24 | 25 | BOOST_AUTO_TEST_CASE(test_read) 26 | { 27 | string data(rec + rec); 28 | istringstream iss(data); 29 | TelemetryRecord tr; 30 | for (int i = 0; i < 2; ++i) { 31 | BOOST_REQUIRE_EQUAL(true, tr.Read(iss)); 32 | BOOST_REQUIRE_EQUAL(1, tr.GetTimestamp()); 33 | BOOST_REQUIRE_EQUAL("abcd", tr.GetPath()); 34 | BOOST_REQUIRE_EQUAL(8, tr.GetDocument()["a"].GetInt()); 35 | } 36 | } 37 | 38 | BOOST_AUTO_TEST_CASE(test_exceed_pathlength) 39 | { 40 | string data(rec + string("\x1e\xff\xff\x07\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00", 15) + rec); 41 | istringstream iss(data); 42 | TelemetryRecord tr; 43 | for (int i = 0; i < 2; ++i) { 44 | BOOST_REQUIRE_EQUAL(true, tr.Read(iss)); 45 | BOOST_REQUIRE_EQUAL(1, tr.GetTimestamp()); 46 | BOOST_REQUIRE_EQUAL("abcd", tr.GetPath()); 47 | BOOST_REQUIRE_EQUAL(8, tr.GetDocument()["a"].GetInt()); 48 | } 49 | } 50 | 51 | BOOST_AUTO_TEST_CASE(test_short_pathlength) 52 | { 53 | string bad_rec("\x1e\x02\x00\x07\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00" "abcd{\"a\":8}", 26); 54 | string data(bad_rec + rec); 55 | istringstream iss(data); 56 | TelemetryRecord tr; 57 | 58 | BOOST_REQUIRE_EQUAL(true, tr.Read(iss)); 59 | BOOST_REQUIRE_EQUAL(1, tr.GetTimestamp()); 60 | BOOST_REQUIRE_EQUAL("abcd", tr.GetPath()); 61 | BOOST_REQUIRE_EQUAL(8, tr.GetDocument()["a"].GetInt()); 62 | 63 | BOOST_REQUIRE_EQUAL(false, tr.Read(iss)); 64 | } 65 | 66 | //BOOST_AUTO_TEST_CASE(test_large_file) 67 | //{ 68 | // ifstream file(kDataPath + "../../../../telemetry.log", ios_base::binary); 69 | // TelemetryRecord tr; 70 | // int cnt = 0; 71 | // while (tr.Read(file)) { 72 | // ++cnt; 73 | // } 74 | // BOOST_REQUIRE_EQUAL(7331, cnt); 75 | //} 76 | -------------------------------------------------------------------------------- /process_incoming/worker/common/test/data/invalid.json: -------------------------------------------------------------------------------- 1 | invalid json data 2 | -------------------------------------------------------------------------------- /process_incoming/worker/common/test/data/invalid_kind.json: -------------------------------------------------------------------------------- 1 | {"histograms":{"MY_HISTOGRAM": { "kind":"bogus"}}} 2 | -------------------------------------------------------------------------------- /process_incoming/worker/common/test/data/invalid_schema.json: -------------------------------------------------------------------------------- 1 | ["a"] 2 | -------------------------------------------------------------------------------- /process_incoming/worker/common/test/data/missing_kind.json: -------------------------------------------------------------------------------- 1 | {"histograms":{"MY_HISTOGRAM":{"test":"1"}}} 2 | -------------------------------------------------------------------------------- /process_incoming/worker/common/test/data/telemetry1.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/process_incoming/worker/common/test/data/telemetry1.log
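These tests nail down the v1 record framing that TelemetryRecord::Read() consumes, and which the 26-byte test string `rec` spells out byte by byte: the 0x1e record separator (kRecordSeparator in TelemetryConstants), a little-endian uint16 path length, uint32 data length, and uint64 timestamp, followed by the path and data bytes. A minimal sketch of the same framing in Python:

```python
import struct

RECORD_SEPARATOR = b"\x1e"  # kRecordSeparator in TelemetryConstants

def pack_v1_record(path, data, timestamp):
    # <HIQ = little-endian uint16 path length, uint32 data length,
    # uint64 timestamp, followed by the raw path and data bytes.
    header = struct.pack("<HIQ", len(path), len(data), timestamp)
    return RECORD_SEPARATOR + header + path + data

def unpack_v1_record(buf):
    assert buf[0:1] == RECORD_SEPARATOR
    len_path, len_data, timestamp = struct.unpack("<HIQ", buf[1:15])
    path = buf[15:15 + len_path]
    data = buf[15 + len_path:15 + len_path + len_data]
    return path, data, timestamp

# Reproduces the 26-byte test record used by TestTelemetryRecord above.
rec = pack_v1_record(b"abcd", b'{"a":8}', 1)
assert rec == (b"\x1e\x04\x00\x07\x00\x00\x00"
               b"\x01\x00\x00\x00\x00\x00\x00\x00" b'abcd{"a":8}')
assert unpack_v1_record(rec) == (b"abcd", b'{"a":8}', 1)
```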
-------------------------------------------------------------------------------- /process_incoming/worker/common/test/data/telemetry_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dimensions": [ 4 | { 5 | "field_name": "submission_date", 6 | "allowed_values": "*" 7 | }, 8 | { 9 | "field_name": "reason", 10 | "allowed_values": ["idle-daily","saved-session"] 11 | }, 12 | { 13 | "field_name": "appName", 14 | "allowed_values": ["Firefox","Fennec","Thunderbird"] 15 | }, 16 | { 17 | "field_name": "appUpdateChannel", 18 | "allowed_values": ["release", "beta", "nightly", "aurora"] 19 | }, 20 | { 21 | "field_name": "appVersion", 22 | "allowed_values": "*" 23 | }, 24 | { 25 | "field_name": "appBuildID", 26 | "allowed_values": "*" 27 | }, 28 | { 29 | "field_name": "memsize", 30 | "allowed_values": {"min":448, "max":500} 31 | } 32 | ] 33 | } 34 | 35 | -------------------------------------------------------------------------------- /process_incoming/worker/convert.json: -------------------------------------------------------------------------------- 1 | { 2 | "heka_server": "localhost:5565", 3 | "telemetry_schema": "telemetry_schema.json", 4 | "histogram_server": "localhost:9898", 5 | "storage_path": "./storage", 6 | "upload_path": "./upload", 7 | "max_uncompressed": 1048576, 8 | "memory_constraint": 1000, 9 | "compression_preset": 0 10 | } 11 | -------------------------------------------------------------------------------- /provisioning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/provisioning/__init__.py -------------------------------------------------------------------------------- /provisioning/ansible/README.md: -------------------------------------------------------------------------------- 1 | Deploying telemetry-analysis 2 | ============================ 3 | 4 | ## Manual setup tasks: 5 | 6 | - In the [AWS SES console](https://us-west-2.console.aws.amazon.com/ses/home?region=us-west-2), make sure that the email address "telemetry-alerts@mozilla.com" is verified. 7 | - If cross-account access is required, make sure the cross-IAM S3 permissions are set up. Edit the bucket policies of the relevant buckets to look something like this: 8 | ```json 9 | { 10 | "Version": "2008-10-17", 11 | "Statement": [ 12 | { 13 | "Sid": "ListAccess", 14 | "Effect": "Allow", 15 | "Principal": { 16 | "AWS": [ 17 | "arn:aws:iam::XXXXXXXXXXXX:root" 18 | ] 19 | }, 20 | "Action": "S3:ListBucket", 21 | "Resource": "arn:aws:s3:::telemetry-published-v2" 22 | }, 23 | { 24 | "Sid": "GetAccess", 25 | "Effect": "Allow", 26 | "Principal": { 27 | "AWS": [ 28 | "arn:aws:iam::XXXXXXXXXXXX:root" 29 | ] 30 | }, 31 | "Action": "S3:GetObject", 32 | "Resource": "arn:aws:s3:::telemetry-published-v2/*" 33 | } 34 | ] 35 | } 36 | ``` 37 | 38 | ## Automated deployment tasks: 39 | 40 | - Build an AMI for telemetry workers: 41 | ```bash 42 | ansible-playbook -i hosts -v --extra-vars "@envs/dev.yml" playbooks/build_ami.yml 43 | ``` 44 | - Set `worker_ami_id` in [`envs/dev.yml`](envs/dev.yml) to the value output by the above command. This is a git-managed file. 45 | - Set the RDS password in `envs/dev_secrets.yml`. See [`envs/dev_secrets.example.yml`](envs/dev_secrets.example.yml) for an example. This is an unmanaged file.
If the telemetry-analysis resources stack has already been created, the value you should set this to is the password portion of the URL. 46 | - Create the static resources CloudFormation stack (only needs to be run once): 47 | ```bash 48 | ansible-playbook -i hosts -v --extra-vars "@envs/dev.yml" --extra-vars "@envs/dev_secrets.yml" playbooks/resources.yml 49 | ``` 50 | 51 | ## To update / deploy the application servers: 52 | 53 | - Create a new code package to use by updating `sources_version` in [`envs/dev.yml`](envs/dev.yml) and running: 54 | ```bash 55 | ansible-playbook -i hosts -v --extra-vars "@envs/dev.yml" playbooks/make_code_package.yml 56 | ``` 57 | - Deploy the CloudFormation template by running: 58 | ```bash 59 | ansible-playbook -i hosts -v --extra-vars "@envs/dev.yml" playbooks/app.yml 60 | ``` 61 | - Deploy the user-facing DNS records (only needs to be run once): 62 | ```bash 63 | ansible-playbook -i hosts -v --extra-vars "@envs/dev.yml" playbooks/route53.yml 64 | ``` 65 | -------------------------------------------------------------------------------- /provisioning/ansible/envs/dev.yml: -------------------------------------------------------------------------------- 1 | stack_name: telemetry-analysis 2 | region: us-west-2 3 | env: dev 4 | key_name: "20151209-cloudservices-aws-ssh-dev" 5 | ssl_cert_arn: arn:aws:iam::927034868273:server-certificate/exp20180829_star_telemetry_mozilla_org 6 | 7 | # code version 8 | sources_version: 31 9 | 10 | dns_name: "telemetry-analysis.dev.mozaws.net" 11 | dns_zone_name: "dev.mozaws.net." 12 | public_analysis_dns_name: "telemetry-analysis-output.dev.mozaws.net" 13 | 14 | instance_type: t2.medium 15 | server_ami_id: ami-a40bea97 16 | 17 | # this value can be updated using the build_ami playbook 18 | worker_ami_id: ami-db8067bb 19 | 20 | spark_instance_profile: telemetry-spark-cloudformation-TelemetrySparkInstanceProfile-1SATUBVEXG7E3 21 | spark_emr_bucket: telemetry-spark-emr-2 22 | -------------------------------------------------------------------------------- /provisioning/ansible/envs/dev_secrets.example.yml: -------------------------------------------------------------------------------- 1 | # This value should be populated from the output of 2 | # the telemetry-analysis-resources stack if it already exists. 3 | database_password: 'but does it achieve the scale of the web?'
4 | server_secret: 'yes' 5 | -------------------------------------------------------------------------------- /provisioning/ansible/hosts: -------------------------------------------------------------------------------- 1 | localhost 2 | -------------------------------------------------------------------------------- /provisioning/ansible/playbooks/app.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - include: resources.yml 3 | 4 | - hosts: localhost 5 | connection: local 6 | tasks: 7 | 8 | - name: get top level git dir 9 | command: 'git rev-parse --show-toplevel' 10 | register: top_level_dir 11 | 12 | - name: make resources template 13 | command: make analysis-service-stack.json 14 | args: 15 | chdir: "{{top_level_dir.stdout}}/http/analysis-service" 16 | 17 | - name: resources 18 | cloudformation: stack_name="{{stack_name}}-app" region={{region}} state=present 19 | args: 20 | template: "{{top_level_dir.stdout}}/http/analysis-service/analysis-service-stack.json" 21 | tags: 22 | App: "telemetry" 23 | Env: "{{env}}" 24 | Stack: "{{stack_name}}" 25 | template_parameters: 26 | ServerInstanceType: "{{instance_type}}" 27 | AnalysisSourcesBucket: "{{resources_cfn.stack_outputs.AnalysisSourcesBucket}}" 28 | AnalysisPrivateDataBucket: "{{resources_cfn.stack_outputs.AnalysisPrivateDataBucket}}" 29 | AnalysisDBSecurityGroup: "{{resources_cfn.stack_outputs.AnalysisDBSecurityGroup}}" 30 | AnalysisDatabaseURL: "{{resources_cfn.stack_outputs.AnalysisDatabaseURL}}" 31 | AnalysisPublicDataBucket: "{{resources_cfn.stack_outputs.AnalysisPublicDataBucket}}" 32 | AnalysisTemporaryBucket: "{{resources_cfn.stack_outputs.AnalysisTemporaryBucket}}" 33 | AnalysisLoadBalancer: "{{resources_cfn.stack_outputs.AnalysisLoadBalancer}}" 34 | ServerSecret: "{{server_secret}}" 35 | KeyName: "{{key_name}}" 36 | WorkerAMI: "{{worker_ami_id}}" 37 | ServerAMI: "{{server_ami_id}}" 38 | SourcesVersion: "{{sources_version}}" 39 | AnalysisPublicWorkerProfile: "{{resources_cfn.stack_outputs.AnalysisPublicWorkerProfile}}" 40 | AnalysisPrivateWorkerProfile: "{{resources_cfn.stack_outputs.AnalysisPrivateWorkerProfile}}" 41 | SparkEMRBucket: "{{spark_emr_bucket}}" 42 | SparkInstanceProfile: "{{spark_instance_profile}}" 43 | # ignore roles 44 | # AnalysisPublicWorkerRole: "{{resources_cfn.stack_outputs.AnalysisPublicWorkerRole}}" 45 | # AnalysisPrivateWorkerRole: "{{resources_cfn.stack_outputs.AnalysisPrivateWorkerRole}}" 46 | -------------------------------------------------------------------------------- /provisioning/ansible/playbooks/build_ami.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: localhost 3 | connection: local 4 | gather_facts: false 5 | 6 | tasks: 7 | - name: get top level git dir 8 | command: 'git rev-parse --show-toplevel' 9 | register: top_level_dir 10 | 11 | - name: build telemetry AMI 12 | command: 'time python -u -m provisioning.aws.create_telemetry_worker_ami provisioning/aws/telemetry_worker.hvm.json' 13 | args: 14 | chdir: '{{ top_level_dir.stdout }}' 15 | -------------------------------------------------------------------------------- /provisioning/ansible/playbooks/make_code_package.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - include: resources.yml 3 | 4 | - hosts: localhost 5 | connection: local 6 | gather_facts: false 7 | 8 | tasks: 9 | - name: get top level git dir 10 | command: 'git rev-parse --show-toplevel' 11 | register: top_level_dir 12 | 
13 | - name: build and upload telemetry code 14 | command: make put SOURCES_BUCKET={{resources_cfn.stack_outputs.AnalysisSourcesBucket}} VERSION={{ sources_version }} 15 | args: 16 | chdir: "{{ top_level_dir.stdout }}/http/analysis-service" 17 | -------------------------------------------------------------------------------- /provisioning/ansible/playbooks/resources.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: localhost 3 | connection: local 4 | tasks: 5 | - include_vars: ../envs/dev_secrets.yml 6 | 7 | - name: get top level git dir 8 | command: 'git rev-parse --show-toplevel' 9 | register: top_level_dir 10 | 11 | - name: make resources template 12 | command: make analysis-resources.json 13 | args: 14 | chdir: "{{top_level_dir.stdout}}/http/analysis-service" 15 | 16 | - name: create resources CFN 17 | cloudformation: stack_name="{{stack_name}}-resources" region={{region}} state=present 18 | args: 19 | template: "{{top_level_dir.stdout}}/http/analysis-service/analysis-resources.json" 20 | tags: 21 | App: "telemetry" 22 | Env: "{{env}}" 23 | Stack: "{{stack_name}}" 24 | template_parameters: 25 | ELBSSLCertARN: "{{ssl_cert_arn}}" 26 | AnalysisDatabasePassword : "{{database_password}}" 27 | register: resources_cfn 28 | -------------------------------------------------------------------------------- /provisioning/ansible/playbooks/route53.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - include: resources.yml 3 | 4 | - hosts: localhost 5 | connection: local 6 | tasks: 7 | - name: promote stack 8 | cloudformation: stack_name="{{stack_name}}-route53" region={{region}} state=present 9 | args: 10 | template: ../templates/route53.json 11 | template_parameters: 12 | DNSName: "{{dns_name}}" 13 | DNSZoneName: "{{dns_zone_name}}" 14 | ELBDNSName: "{{resources_cfn.stack_outputs.AnalysisLoadBalancerDNSName}}" 15 | ELBZoneNameID: "{{resources_cfn.stack_outputs.AnalysisLoadBalancerZoneID}}" 16 | Environment: "{{env}}" 17 | Region: "{{region}}" 18 | AnalysisPublicCDNDomainName: "{{resources_cfn.stack_outputs.AnalysisPublicCDNDomainName}}" 19 | AnalysisPublicDomainName: "{{public_analysis_dns_name}}" 20 | tags: 21 | App: "telemetry" 22 | Env: "{{env}}" 23 | Stack: "{{stack_name}}" 24 | register: promote 25 | 26 | - debug: var=promote 27 | -------------------------------------------------------------------------------- /provisioning/ansible/templates/route53.json: -------------------------------------------------------------------------------- 1 | { 2 | "AWSTemplateFormatVersion": "2010-09-09", 3 | "Description": "Telemetry analysis Route53", 4 | "Parameters": { 5 | "DNSName": { 6 | "Type": "String" 7 | }, 8 | "DNSZoneName": { 9 | "Default": "dev.mozaws.net.", 10 | "Type": "String" 11 | }, 12 | "ELBZoneNameID": { 13 | "Description": "From app stack.", 14 | "Type": "String" 15 | }, 16 | "ELBDNSName": { 17 | "Description": "From app stack.", 18 | "Type": "String" 19 | }, 20 | "Environment": { 21 | "Description": "Environment", 22 | "Type": "String", 23 | "Default": "dev" 24 | }, 25 | "Region": { 26 | "Description": "Environment", 27 | "Type": "String", 28 | "Default": "us-west-2" 29 | }, 30 | "AnalysisPublicCDNDomainName": { 31 | "Type": "String" 32 | }, 33 | "AnalysisPublicDomainName": { 34 | "Type": "String" 35 | } 36 | }, 37 | "Resources": { 38 | "R53AliasRecord": { 39 | "Type": "AWS::Route53::RecordSet", 40 | "Properties": { 41 | "HostedZoneName": { 42 | "Ref": "DNSZoneName" 43 | }, 44 | "Name": { 
45 | "Ref": "DNSName" 46 | }, 47 | "Type": "A", 48 | "Region": { 49 | "Ref": "Region" 50 | }, 51 | "AliasTarget": { 52 | "EvaluateTargetHealth": true, 53 | "HostedZoneId": { 54 | "Ref": "ELBZoneNameID" 55 | }, 56 | "DNSName": { 57 | "Ref": "ELBDNSName" 58 | } 59 | }, 60 | "SetIdentifier": { 61 | "Ref": "Region" 62 | } 63 | } 64 | }, 65 | "PublicBucketR53Record": { 66 | "Type": "AWS::Route53::RecordSet", 67 | "Properties": { 68 | "HostedZoneName": { 69 | "Ref": "DNSZoneName" 70 | }, 71 | "Name": { 72 | "Ref": "AnalysisPublicDomainName" 73 | }, 74 | "Type": "CNAME", 75 | "ResourceRecords": [ 76 | { 77 | "Ref": "AnalysisPublicCDNDomainName" 78 | } 79 | ], 80 | "TTL": 60 81 | } 82 | } 83 | }, 84 | "Outputs": { 85 | "Domain": { 86 | "Value": { 87 | "Ref": "PublicBucketR53Record" 88 | } 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /provisioning/aws/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/provisioning/aws/__init__.py -------------------------------------------------------------------------------- /provisioning/aws/aws_incoming.example.json: -------------------------------------------------------------------------------- 1 | { 2 | "ssl_user": "ubuntu", 3 | "ssl_key_name": "mreid", 4 | "ssl_key_path": "~/.ssh/aws/mreid.pem", 5 | "ssl_retries": 10, 6 | "incoming_bucket": "mreid-telemetry-incoming-test", 7 | "incoming_batch_size": 4, 8 | "publish_bucket": "mreid-telemetry-published-test", 9 | "image": "ami-76831f46", 10 | "skip_conversion": false, 11 | "loop": true, 12 | "instance_type": "t1.micro", 13 | "security_groups": ["telemetry"], 14 | "region": "us-west-2", 15 | "placement": "us-west-2c", 16 | "name": "mreid-telemetry-process-incoming-test", 17 | "default_tags": { 18 | "Owner": "mreid", 19 | "Application": "telemetry-server" 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /provisioning/aws/aws_incoming.prod.json: -------------------------------------------------------------------------------- 1 | { 2 | "incoming_bucket": "telemetry-incoming-v2", 3 | "incoming_queue": "telemetry-incoming-v1", 4 | "incoming_batch_size": 8, 5 | "publish_bucket": "telemetry-published-v2", 6 | "image": "ami-76831f46", 7 | "skip_conversion": false, 8 | "loop": true, 9 | "instance_type": "c1.xlarge", 10 | "ssl_key_name": "mreid", 11 | "ssl_key_path": "~/.ssh/aws/mreid.pem", 12 | "ssl_retries": 10, 13 | "ssl_user": "ubuntu", 14 | "security_groups": ["telemetry"], 15 | "region": "us-west-2", 16 | "placement": "us-west-2c", 17 | "name": "telemetry-process-incoming-v1-1", 18 | "default_tags": { 19 | "Owner": "mreid", 20 | "Application": "telemetry-server" 21 | }, 22 | "ephemeral_map": { 23 | "/dev/xvdb": "ephemeral0", 24 | "/dev/xvdc": "ephemeral1", 25 | "/dev/xvdd": "ephemeral2", 26 | "/dev/xvde": "ephemeral3" 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /provisioning/aws/aws_telemetry_server_config.example.json: -------------------------------------------------------------------------------- 1 | { 2 | "ssl_user": "ubuntu", 3 | "ssl_key_name": "mreid", 4 | "ssl_key_path": "~/.ssh/aws/mreid.pem", 5 | "ssl_retries": 10, 6 | "base_dir": "/mnt/telemetry", 7 | "instance_type": "m1.xlarge", 8 | "image": "ami-bf1d8a8f", 9 | "security_groups": ["telemetry"], 10 | "region": "us-west-2", 11 | "placement": 
"us-west-2c", 12 | "shutdown_behavior": "stop", 13 | "name": "mreid-telemetry-server-test", 14 | "incoming_bucket": "mreid-telemetry-incoming-test", 15 | "process_incoming_config": "./aws_incoming.example.json", 16 | "default_tags": { 17 | "Owner": "mreid", 18 | "Application": "telemetry-server" 19 | }, 20 | "skip_termination": true, 21 | "ephemeral_map": { 22 | "/dev/xvdb": "ephemeral0", 23 | "/dev/xvdc": "ephemeral1", 24 | "/dev/xvdd": "ephemeral2", 25 | "/dev/xvde": "ephemeral3" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /provisioning/aws/aws_telemetry_server_config.prod.json: -------------------------------------------------------------------------------- 1 | { 2 | "ssl_user": "ubuntu", 3 | "ssl_key_name": "mreid", 4 | "ssl_key_path": "~/.ssh/aws/mreid.pem", 5 | "ssl_retries": 10, 6 | "base_dir": "/mnt/telemetry", 7 | "instance_type": "m1.small", 8 | "image": "ami-ace67f9c", 9 | "security_groups": ["telemetry"], 10 | "region": "us-west-2", 11 | "placement": "us-west-2c", 12 | "shutdown_behavior": "stop", 13 | "name": "telemetry-server-v1-primary-1", 14 | "incoming_bucket": "telemetry-incoming-v2", 15 | "incoming_queue": "telemetry-incoming-v1", 16 | "process_incoming_config": "provisioning/aws/aws_incoming.prod.json", 17 | "primary_server": true, 18 | "default_tags": { 19 | "Owner": "mreid", 20 | "Application": "telemetry-server" 21 | }, 22 | "skip_termination": true 23 | } 24 | -------------------------------------------------------------------------------- /provisioning/aws/aws_telemetry_server_config.prod_secondary.json: -------------------------------------------------------------------------------- 1 | { 2 | "ssl_user": "ubuntu", 3 | "ssl_key_name": "mreid", 4 | "ssl_key_path": "~/.ssh/aws/mreid.pem", 5 | "ssl_retries": 10, 6 | "base_dir": "/mnt/telemetry", 7 | "instance_type": "m1.small", 8 | "image": "ami-ace67f9c", 9 | "security_groups": ["telemetry"], 10 | "region": "us-west-2", 11 | "placement": "us-west-2c", 12 | "shutdown_behavior": "stop", 13 | "name": "telemetry-server-v1-secondary-1", 14 | "incoming_bucket": "telemetry-incoming-v2", 15 | "incoming_queue": "telemetry-incoming-v1", 16 | "process_incoming_config": "provisioning/aws/aws_incoming.prod.json", 17 | "primary_server": false, 18 | "default_tags": { 19 | "Owner": "mreid", 20 | "Application": "telemetry-server" 21 | }, 22 | "skip_termination": true 23 | } 24 | -------------------------------------------------------------------------------- /provisioning/aws/create_ami.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | import time 9 | from datetime import date 10 | 11 | class AmiCreator: 12 | def __init__(self, launcher): 13 | self.launcher = launcher 14 | 15 | def create(self, description=None): 16 | self.launcher.go() 17 | conn = self.launcher.get_connection() 18 | instance = self.launcher.get_instance() 19 | print "Instance", instance.id, "is now configured. Stopping it." 
20 | stopping_instances = conn.stop_instances(instance_ids=[instance.id]) 21 | instance.update() 22 | for i in range(120): 23 | print i, "Instance is", instance.state 24 | if instance.state == "stopped": 25 | break 26 | time.sleep(1) 27 | instance.update() 28 | 29 | print "Creating an AMI..." 30 | # Create an AMI (after stopping the instance) 31 | # Give it a good name %s-yyyymmdd where %s is instance name stolen from 32 | # launcher which reads it from config or commandline 33 | base_name = self.launcher.config["name"] 34 | ami_name = "{0}-{1}".format(base_name, date.today().strftime("%Y%m%d")) 35 | ami_desc = description 36 | if ami_desc is None: 37 | ami_desc = self.launcher.config.get("description") 38 | if ami_desc is None: 39 | ami_desc = 'Generic Telemetry Image' 40 | # This automatically stops the instance first (unless you tell it not to) 41 | ami_id = conn.create_image(instance.id, ami_name, description=ami_desc) 42 | print "Created a new AMI:" 43 | print " ID:", ami_id 44 | print " Name:", ami_name 45 | print " Desc:", ami_desc 46 | # Get the image and wait for it to be available: 47 | ami_image = conn.get_image(ami_id) 48 | retry_count = 0 49 | while retry_count < 15 and ami_image.state != "available": 50 | retry_count += 1 51 | print "AMI is", ami_image.state, "... waiting 10s for it to become available" 52 | time.sleep(10) 53 | ami_image.update() 54 | print "AMI is", ami_image.state 55 | if ami_image.state != "available": 56 | print "The image is not quite available yet, but you're probably bored of waiting, so we'll continue." 57 | # Now clean up the instance. 58 | print "Terminating instance", instance.id 59 | self.launcher.terminate(conn, instance) 60 | print "Those AMI details again:" 61 | print " ID:", ami_id 62 | print " Name:", ami_name 63 | print " Desc:", ami_desc 64 | return 0 65 | -------------------------------------------------------------------------------- /provisioning/aws/create_telemetry_base_ami.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | # Example invocation: 9 | # $ cd /path/to/telemetry-server 10 | # $ python -m provisioning.aws.create_telemetry_base_ami -k "my_aws_key" -s "my_aws_secret" provisioning/aws/telemetry_server_base.pv.json 11 | 12 | from launch_telemetry_server import TelemetryServerLauncher 13 | from create_ami import AmiCreator 14 | import sys 15 | import traceback 16 | 17 | def main(): 18 | launcher = TelemetryServerLauncher() 19 | creator = AmiCreator(launcher) 20 | try: 21 | result = creator.create('Pre-loaded image for telemetry nodes. Knows ' \ 22 | 'how to run all the core services, but does ' \ 23 | 'not auto-start them on boot.') 24 | return result 25 | except Exception, e: 26 | print "Error:", e 27 | traceback.print_exc() 28 | return 1 29 | 30 | if __name__ == "__main__": 31 | sys.exit(main()) 32 | -------------------------------------------------------------------------------- /provisioning/aws/create_telemetry_worker_ami.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0.
If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | # Example invocation: 9 | # $ cd /path/to/telemetry-server 10 | # $ python -m provisioning.aws.create_telemetry_worker_ami -k "my_aws_key" -s "my_aws_secret" provisioning/aws/telemetry_worker.hvm.json 11 | 12 | from aws_launcher import Launcher 13 | from create_ami import AmiCreator 14 | import sys 15 | import traceback 16 | 17 | def main(): 18 | launcher = Launcher() 19 | creator = AmiCreator(launcher) 20 | try: 21 | result = creator.create('Pre-loaded image for telemetry workers. Use ' \ 22 | 'it for scheduled or adhoc jobs.') 23 | return result 24 | except Exception, e: 25 | print "Error:", e 26 | traceback.print_exc() 27 | return 1 28 | 29 | if __name__ == "__main__": 30 | sys.exit(main()) 31 | -------------------------------------------------------------------------------- /provisioning/aws/launch_mapreduce_job.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | import boto.ec2 9 | import time 10 | import os 11 | import simplejson as json 12 | from fabric.api import * 13 | from fabric.exceptions import NetworkError 14 | import sys 15 | import aws_util 16 | from aws_launcher import Launcher 17 | import traceback 18 | 19 | class MapReduceLauncher(Launcher): 20 | def post_install(self, instance): 21 | base_dir = self.config.get("base_dir", "/mnt/telemetry") 22 | with cd(base_dir): 23 | # "data" is a dummy dir just to give it somewhere to look for local data. 24 | run("mkdir job work data") 25 | 26 | def run(self, instance): 27 | home = "/home/" + self.ssl_user 28 | mr_cfg = self.config["mapreduce"] 29 | base_dir = self.config.get("base_dir", "/mnt/telemetry") 30 | job_dir = base_dir + "/job" 31 | data_dir = base_dir + "/data" 32 | work_dir = base_dir + "/work" 33 | with cd(home + "/telemetry-server"): 34 | job_script = mr_cfg["job_script"] 35 | input_filter = mr_cfg["input_filter"] 36 | put(job_script, job_dir) 37 | put(input_filter, job_dir) 38 | job_script_path = os.path.join(job_dir, os.path.basename(job_script)) 39 | input_filter_path = os.path.join(job_dir, os.path.basename(input_filter)) 40 | output_path = os.path.join(job_dir, "output.txt") 41 | job_args = (job_script_path, input_filter_path, data_dir, work_dir, output_path, self.aws_key, self.aws_secret_key, mr_cfg["data_bucket"]) 42 | run('python -m mapreduce.job %s --input-filter %s --data-dir %s --work-dir %s --output %s --aws-key "%s" --aws-secret-key "%s" --bucket "%s"' % job_args) 43 | # TODO: consult "output_compression" 44 | run("xz " + output_path) 45 | # TODO: upload job/output.txt.xz to S3 output_bucket.output_filename 46 | result = get(output_path + ".xz", mr_cfg["output_filename"]) 47 | # TODO: check result.succeeded before bailing. 
48 | 49 | def main(): 50 | try: 51 | launcher = MapReduceLauncher() 52 | launcher.go() 53 | return 0 54 | except Exception, e: 55 | print "Error:", e 56 | traceback.print_exc() 57 | return 1 58 | 59 | if __name__ == "__main__": 60 | sys.exit(main()) 61 | -------------------------------------------------------------------------------- /provisioning/aws/process_incoming_queue.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | import time 9 | from fabric.api import * 10 | import sys 11 | import aws_util 12 | from process_incoming_distributed import ProcessIncomingLauncher 13 | import traceback 14 | 15 | 16 | class ProcessIncomingQueueLauncher(ProcessIncomingLauncher): 17 | def run(self, instance): 18 | home = "/home/" + self.ssl_user 19 | 20 | # Update from github 21 | with cd(home + "/telemetry-server"): 22 | run("git pull") 23 | 24 | q_conn = aws_util.connect_sqs(self.config["region"], self.aws_key, self.aws_secret_key) 25 | incoming_queue = q_conn.get_queue(self.config["incoming_queue"]) 26 | 27 | if self.config.get("loop", False): 28 | while True: 29 | if incoming_queue.count() == 0: 30 | print "No files to process yet. Sleeping for a while..." 31 | # TODO: Terminate 'instance' and fire up a new one when we need it? 32 | time.sleep(60) 33 | continue 34 | self.process_incoming_queue(instance) 35 | else: 36 | self.process_incoming_queue(instance) 37 | 38 | def process_incoming_queue(self, instance): 39 | home = "/home/" + self.ssl_user 40 | with cd(home + "/telemetry-server"): 41 | skip_conversion = "" 42 | if self.config.get("skip_conversion", False): 43 | skip_conversion = "--skip-conversion" 44 | print "Processing incoming queue:", self.config["incoming_queue"] 45 | run('python -m process_incoming.process_incoming_mp --bad-data-log /mnt/telemetry/bad_records.txt -k "%s" -s "%s" -r "%s" -w /mnt/telemetry/work -o /mnt/telemetry/processed -t ./telemetry/telemetry_schema.json -q "%s" %s %s %s' % (self.aws_key, self.aws_secret_key, self.config["region"], self.config["incoming_queue"], skip_conversion, self.config["incoming_bucket"], self.config["publish_bucket"])) 46 | 47 | def main(): 48 | try: 49 | launcher = ProcessIncomingQueueLauncher() 50 | launcher.go() 51 | return 0 52 | except Exception, e: 53 | print "Error:", e 54 | traceback.print_exc() 55 | return 1 56 | 57 | if __name__ == "__main__": 58 | sys.exit(main()) 59 | -------------------------------------------------------------------------------- /provisioning/aws/telemetry_server_base.hvm.json: -------------------------------------------------------------------------------- 1 | { 2 | "ssl_user": "ubuntu", 3 | "ssl_key_name": "mreid", 4 | "ssl_key_path": "~/.ssh/aws/mreid.pem", 5 | "ssl_retries": 10, 6 | "base_dir": "/mnt/telemetry", 7 | "instance_type": "c3.xlarge", 8 | "image": "ami-5189a661", 9 | "security_groups": ["telemetry"], 10 | "region": "us-west-2", 11 | "placement": "us-west-2c", 12 | "shutdown_behavior": "stop", 13 | "name": "telemetry-server-base-hvm", 14 | "default_tags": { 15 | "Owner": "mreid", 16 | "Application": "telemetry-server" 17 | }, 18 | "add_aws_credentials": true, 19 | "skip_termination": true 20 | } 21 | -------------------------------------------------------------------------------- 
/provisioning/aws/telemetry_server_base.pv.json: -------------------------------------------------------------------------------- 1 | { 2 | "ssl_user": "ubuntu", 3 | "ssl_key_name": "mreid", 4 | "ssl_key_path": "~/.ssh/aws/mreid.pem", 5 | "ssl_retries": 10, 6 | "base_dir": "/mnt/telemetry", 7 | "instance_type": "t1.micro", 8 | "image": "ami-6989a659", 9 | "security_groups": ["telemetry"], 10 | "region": "us-west-2", 11 | "placement": "us-west-2c", 12 | "shutdown_behavior": "stop", 13 | "name": "telemetry-server-base-pv", 14 | "default_tags": { 15 | "Owner": "mreid", 16 | "Application": "telemetry-server" 17 | }, 18 | "add_aws_credentials": true, 19 | "skip_termination": true 20 | } 21 | -------------------------------------------------------------------------------- /provisioning/aws/telemetry_worker.hvm.json: -------------------------------------------------------------------------------- 1 | { 2 | "ssl_user": "ubuntu", 3 | "ssl_key_name": "mreid", 4 | "ssl_key_path": "~/.ssh/aws/mreid.pem", 5 | "ssl_retries": 10, 6 | "upgrade_os": true, 7 | "base_dir": "/mnt/telemetry", 8 | "instance_type": "c3.2xlarge", 9 | "image": "ami-2cfe1a1f", 10 | "security_groups": ["telemetry"], 11 | "region": "us-west-2", 12 | "placement": "us-west-2c", 13 | "shutdown_behavior": "stop", 14 | "name": "telemetry-worker-hvm", 15 | "default_tags": { 16 | "Owner": "mreid", 17 | "Application": "telemetry-server" 18 | }, 19 | "skip_termination": true 20 | } 21 | -------------------------------------------------------------------------------- /provisioning/config/boto.cfg: -------------------------------------------------------------------------------- 1 | [Boto] 2 | debug = 0 3 | num_retries = 20 4 | metadata_service_timeout = 3.0 5 | metadata_service_num_attempts = 20 6 | -------------------------------------------------------------------------------- /provisioning/config/telemetry_aws.prod.json: -------------------------------------------------------------------------------- 1 | { 2 | "incoming_bucket": "telemetry-incoming-v2", 3 | "incoming_queue": "telemetry-incoming-v1", 4 | "publish_bucket": "telemetry-published-v2", 5 | "aws_region": "us-west-2" 6 | } 7 | -------------------------------------------------------------------------------- /server/server_config.spot.json: -------------------------------------------------------------------------------- 1 | { 2 | "motd": " ==== Spot Telemetry Server. Accepting Submissions since 2013. 
====", 3 | "max_data_lenth": 204800, 4 | "max_path_lenth": 10240, 5 | "port": 8080, 6 | "log_path": "/mnt/telemetry/data", 7 | "max_log_age_ms": 120000, 8 | "max_log_size": 50000000, 9 | "stats_log_file": "/var/log/telemetry/telemetry-server.log" 10 | } 11 | -------------------------------------------------------------------------------- /telemetry/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/telemetry/__init__.py -------------------------------------------------------------------------------- /telemetry/telemetry_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dimensions": [ 4 | { 5 | "field_name": "reason", 6 | "allowed_values": ["idle-daily","saved-session","android-anr-report","ftu","loop","flash-video","appusage","main"] 7 | }, 8 | { 9 | "field_name": "appName", 10 | "allowed_values": ["Firefox","Fennec","Thunderbird","FirefoxOS","B2G"] 11 | }, 12 | { 13 | "field_name": "appUpdateChannel", 14 | "allowed_values": ["release", "beta", "nightly", "aurora"] 15 | }, 16 | { 17 | "field_name": "appVersion", 18 | "allowed_values": "*" 19 | }, 20 | { 21 | "field_name": "appBuildID", 22 | "allowed_values": "*" 23 | }, 24 | { 25 | "field_name": "submission_date", 26 | "allowed_values": "*" 27 | } 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /telemetry/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/telemetry/util/__init__.py -------------------------------------------------------------------------------- /telemetry/util/bench.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-------------------------------------------------------------------------------- /telemetry/util/bench.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | import time 9 | from contextlib import contextmanager 10 | @contextmanager 11 | def bench(label): 12 | start = time.clock() 13 | try: 14 | yield 15 | finally: 16 | duration = time.clock() - start 17 | print label, "Elapsed time:", duration, "seconds" 18 | 19 | import re 20 | from string import maketrans 21 | x = "hello" 22 | trantab = maketrans("\r\n", "  ") 23 | eols = re.compile('[\r\n]') 24 | input2 = "FFFFFFFFFFFFFFFFFFFFFFFF FFFFF" * 3000 25 | input = "FFFFFFFF\rFFFFFFF\nFF FFF \r\r\n\n" * 3000 26 | 27 | with bench("Translate (with eols)"): 28 | for i in range(10000): 29 | if "\r" in input or "\n" in input: 30 | x = input.translate(trantab) 31 | 32 | with bench("Replace (with eols)"): 33 | for i in range(10000): 34 | if "\r" in input or "\n" in input: 35 | x = input.replace("\r", " ").replace("\n", " ") 36 | 37 | with bench("Regex (with eols)"): 38 | for i in range(10000): 39 | if "\r" in input or "\n" in input: 40 | x, count = eols.subn(" ", input) 41 | 42 | with bench("Translate (no eols)"): 43 | for i in range(10000): 44 | if "\r" in input2 or "\n" in input2: 45 | x = input2.translate(trantab) 46 | 47 | with bench("Replace (no eols)"): 48 | for i in range(10000): 49 | if "\r" in input2 or "\n" in input2: 50 | x = input2.replace("\r", " ").replace("\n", " ") 51 | 52 | with bench("Regex (no eols)"): 53 | for i in range(10000): 54 | if "\r" in input2 or "\n" in input2: 55 | x, count = eols.subn(" ", input2) 56 | -------------------------------------------------------------------------------- /telemetry/util/bucket_list.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import socket 3 | import sys 4 | import traceback 5 | import boto 6 | from boto.s3.connection import S3Connection 7 | from datetime import datetime 8 | import telemetry.util.timer as timer 9 | 10 | def s3obj_to_string(key): 11 | return u"\t".join((key.name, str(key.size), key.etag[1:-1])) 12 | 13 | # List the files in the bucket, optionally restricted to a prefix. 14 | def list_files(bucket_name, output_file, output_func=s3obj_to_string, prefix=''): 15 | s3 = S3Connection() 16 | bucket = s3.get_bucket(bucket_name) 17 | total_count = 0 18 | start_time = datetime.now() 19 | done = False 20 | last_key = '' 21 | while not done: 22 | try: 23 | for k in bucket.list(prefix=prefix, marker=last_key): 24 | last_key = k.name 25 | total_count += 1 26 | if total_count % 5000 == 0: 27 | print "Looked at", total_count, "total records in", timer.delta_sec(start_time), "seconds.
Last key was", last_key 28 | try: 29 | output_file.write(str(output_func(k)) + "\n") 30 | except Exception, e: 31 | print "Error writing key", k.name, ":", e 32 | traceback.print_exc() 33 | done = True 34 | except socket.error, e: 35 | print "Error listing keys:", e 36 | traceback.print_exc() 37 | print "Continuing from last seen key:", last_key 38 | 39 | output_file.close() 40 | print "Overall, listed", total_count, "in", timer.delta_sec(start_time), "seconds" 41 | 42 | def main(): 43 | parser = argparse.ArgumentParser(description="List S3 contents (with retry) to a file") 44 | parser.add_argument("--output-file", type=argparse.FileType('w')) 45 | parser.add_argument("--bucket", default="telemetry-published-v2") 46 | parser.add_argument("--prefix", default="") 47 | parser.add_argument("--verbose", action="store_true") 48 | parser.add_argument("--debug", action="store_true") 49 | args = parser.parse_args() 50 | 51 | if args.debug: 52 | boto.set_stream_logger('boto') 53 | 54 | list_files(args.bucket, args.output_file, prefix=args.prefix) 55 | 56 | if __name__ == "__main__": 57 | sys.exit(main()) 58 | -------------------------------------------------------------------------------- /telemetry/util/convert_log_v0_to_v1.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | import sys, struct 6 | 7 | fin = open(sys.argv[1], "rb") 8 | fout = open(sys.argv[2], "wb") 9 | 10 | record_count = 0 11 | while True: 12 | header = fin.read(16) 13 | if header == '': 14 | break 15 | record_count += 1 16 | 17 | len_path, len_data, timestamp = struct.unpack("<IIQ", header) 18 | path = fin.read(len_path) 19 | data = fin.read(len_data) 20 | fout.write("\x1e") 21 | fout.write(struct.pack("<HIQ", len_path, len_data, timestamp)) 22 | fout.write(path) 23 | fout.write(data) 24 | 25 | print "Converted", record_count, "records" 26 | -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | import simplejson as json 6 | import sys 7 | import telemetry.util.files as fu 8 | 9 | filename = sys.argv[1] 10 | parse = False 11 | 12 | if len(sys.argv) > 2 and sys.argv[2] == 'parse': 13 | parse = True 14 | 15 | record_count = 0 16 | bad_records = 0 17 | version = fu.detect_file_version(filename) 18 | print "It appears that this is a", version, "log file."
19 | for r in fu.unpack(filename, verbose=True, file_version=version): 20 | record_count += 1 21 | if r.error: 22 | print "Record", record_count, "was bad:", r.error 23 | bad_records += 1 24 | continue 25 | 26 | if parse: 27 | try: 28 | parsed_json = json.loads(r.data) 29 | except Exception, e: 30 | bad_records += 1 31 | print "Record", record_count, "failed to parse json:", e 32 | 33 | print "Processed", record_count, "records, with", bad_records, "bad records" 34 | -------------------------------------------------------------------------------- /test/test.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/test/test.txt.gz -------------------------------------------------------------------------------- /test/test.txt.lzma: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/test/test.txt.lzma -------------------------------------------------------------------------------- /test/test.txt.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/test/test.txt.xz -------------------------------------------------------------------------------- /test/unicode.v1.packed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-server/a685e20534f5175421a08807efb5e897e91fb43a/test/unicode.v1.packed --------------------------------------------------------------------------------