├── configs
│   ├── config.json
│   └── ch_types.json
├── README.md
├── utils.py
├── README_RU.md
├── metrica_logs_api.py
├── clickhouse.py
└── logs_api.py
/configs/config.json:
--------------------------------------------------------------------------------
1 | {
2 |     "token" : "",
3 |     "counter_id": "",
4 |     "disable_ssl_verification_for_clickhouse": 0,
5 |     "visits_fields": [
6 |         "ym:s:counterID",
7 |         "ym:s:dateTime",
8 |         "ym:s:date",
9 |         "ym:s:clientID"
10 |     ],
11 |     "hits_fields": [
12 |         "ym:pv:counterID",
13 |         "ym:pv:dateTime",
14 |         "ym:pv:date",
15 |         "ym:pv:clientID"
16 |     ],
17 |     "log_level": "INFO",
18 |     "retries": 1,
19 |     "retries_delay": 60,
20 |     "clickhouse": {
21 |         "host": "http://localhost:8123",
22 |         "user": "",
23 |         "password": "",
24 |         "visits_table": "visits_all",
25 |         "hits_table": "hits_all",
26 |         "database": "default"
27 |     }
28 | }
29 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [Russian version](README_RU.md)
2 | 
3 | # DEPRECATED
4 | 
5 | This repository is no longer supported. Please refer to the [guide](https://cloud.yandex.com/en/docs/datalens/tutorials/data-from-metrica-yc-visualization) on integration with the Yandex.Metrica Logs API, supported by Yandex.Cloud.
6 | 
7 | # Integration with Logs API
8 | This script helps you integrate the Yandex.Metrica Logs API with ClickHouse.
9 | 
10 | 
11 | ## Requirements
12 | The script uses Python 2.7 and requires the `requests` library, which you can install with the [pip](https://pip.pypa.io/en/stable/installing/) package manager:
13 | ```bash
14 | pip install requests
15 | ```
16 | 
17 | You also need a running ClickHouse instance to load the data into. Instructions on how to install ClickHouse can be found on the [official site](https://clickhouse.yandex/).
18 | 
19 | ## Setting up
20 | First of all, you need to fill in the [config](./configs/config.json):
21 | ```javascript
22 | {
23 |     "token" : "", // token to access the Yandex.Metrica API
24 |     "counter_id": "",
25 |     "visits_fields": [ // list of params for visits
26 |         "ym:s:counterID",
27 |         "ym:s:dateTime",
28 |         "ym:s:date",
29 |         "ym:s:firstPartyCookie"
30 |     ],
31 |     "hits_fields": [ // list of params for hits
32 |         "ym:pv:counterID",
33 |         "ym:pv:dateTime",
34 |         "ym:pv:date",
35 |         "ym:pv:firstPartyCookie"
36 |     ],
37 |     "log_level": "INFO",
38 |     "retries": 1,
39 |     "retries_delay": 60, // delay between retries
40 |     "clickhouse": {
41 |         "host": "http://localhost:8123",
42 |         "user": "",
43 |         "password": "",
44 |         "visits_table": "visits_all", // table name for visits
45 |         "hits_table": "hits_all", // table name for hits
46 |         "database": "default" // database name
47 |     }
48 | }
49 | ```
50 | 
51 | On first execution, the script creates all the tables in the database according to the config. So if you change the field lists, you need to either drop the tables and load the data again, or add the new columns manually using [ALTER TABLE](https://clickhouse.yandex/reference_ru.html#ALTER).
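
For example, if you later add `ym:s:referer` to `visits_fields`, the matching column can be created by hand over the ClickHouse HTTP interface. This is only a sketch: it assumes the default `database` and `visits_table` values from the config above, derives the column name and type the same way the script does (see [ch_types.json](./configs/ch_types.json)), and omits authentication parameters.
```bash
echo "ALTER TABLE default.visits_all ADD COLUMN Referer String" | curl 'http://localhost:8123/' --data-binary @-
```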
52 | 
53 | ## Running the program
54 | 
55 | When running the program, you need to specify a source (hits or visits) using the `-source` option.
56 | 
57 | The script has several modes:
58 | * __history__ - loads all the data from the counter's creation date up to the day before yesterday
59 | * __regular__ - loads data only for the day before yesterday (recommended for regular downloads; see the cron sketch after this list)
60 | * __regular_early__ - loads yesterday's data (yesterday's data may be incomplete: some visits can lack page views)
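
To run the __regular__ mode automatically, you can schedule the script, for example with cron. A minimal sketch (the schedule and the repository path are placeholders to adjust for your setup):
```bash
# crontab entry: every day at 05:00 load visits for the day before yesterday
0 5 * * * cd /path/to/this/repo && python metrica_logs_api.py -mode regular -source visits
```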
61 | 
62 | Example:
63 | ```bash
64 | python metrica_logs_api.py -mode history -source visits
65 | ```
66 | 
67 | You can also load data for a particular time period:
68 | ```bash
69 | python metrica_logs_api.py -source hits -start_date 2016-10-10 -end_date 2016-10-18
70 | ```
71 | 
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import (absolute_import, division,
2 |                         print_function, unicode_literals)
3 | 
4 | import json
5 | import argparse
6 | import requests
7 | import platform
8 | 
9 | DATE_FORMAT = '%Y-%m-%d'
10 | 
11 | class Structure:
12 |     def __init__(self, **kwds):
13 |         self.__dict__.update(kwds)
14 | 
15 |     def __str__(self):
16 |         return json.dumps(self.__dict__, sort_keys=True, indent=2)
17 | 
18 |     def __repr__(self):
19 |         return json.dumps(self.__dict__, sort_keys=True, indent=2)
20 | 
21 | 
22 | def validate_user_request(user_request):
23 |     '''Validates initial user request'''
24 |     assert user_request.source in ['hits', 'visits'], 'Invalid source'
25 | 
26 | 
27 | def validate_cli_options(options):
28 |     '''Validates command line options'''
29 |     assert options.source is not None, \
30 |         'Source must be specified in CLI options'
31 |     if options.mode is None:
32 |         assert (options.start_date is not None) \
33 |             and (options.end_date is not None), \
34 |             'Dates or mode must be specified'
35 |     else:
36 |         assert options.mode in ['history', 'regular', 'regular_early'], \
37 |             'Wrong mode in CLI options'
38 | 
39 | 
40 | def get_cli_options():
41 |     '''Returns command line options'''
42 |     parser = argparse.ArgumentParser()
43 |     parser.add_argument('-start_date', help='Start of period')
44 |     parser.add_argument('-end_date', help='End of period')
45 |     parser.add_argument('-mode', help='Mode (one of [history, regular, regular_early])')
46 |     parser.add_argument('-source', help='Source (hits or visits)')
47 |     options = parser.parse_args()
48 |     validate_cli_options(options)
49 |     return options
50 | 
51 | 
52 | def get_counter_creation_date(counter_id, token):
53 |     '''Returns create date for counter'''
54 |     host = 'https://api-metrika.yandex.ru'
55 |     url = '{host}/management/v1/counter/{counter_id}' \
56 |         .format(counter_id=counter_id, host=host)
57 | 
58 |     headers = {'Authorization': 'OAuth ' + token}
59 | 
60 |     r = requests.get(url, headers=headers)
61 |     if r.status_code == 200:
62 |         date = json.loads(r.text)['counter']['create_time'].split('T')[0]
63 |         return date
64 | 
65 | 
66 | def get_config():
67 |     '''Returns user config'''
68 |     with open('./configs/config.json') as input_file:
69 |         config = json.loads(input_file.read())
70 | 
71 |     assert 'counter_id' in config, 'CounterID must be specified in config'
72 |     assert 'token' in config, 'Token must be specified in config'
73 |     assert 'retries' in config, 'Number of retries should be specified in config'
74 |     assert 'retries_delay' in config, 'Delay between retries should be specified in config'
75 |     return config
76 | 
77 | 
78 | def get_ch_fields_config():
79 |     '''Returns config for ClickHouse columns' datatypes'''
80 |     with open('./configs/ch_types.json') as input_file:
81 |         ch_field_types = json.loads(input_file.read())
82 |     return ch_field_types
83 | 
84 | def get_python_version():
85 |     return platform.python_version()
--------------------------------------------------------------------------------
/README_RU.md:
--------------------------------------------------------------------------------
1 | # DEPRECATED
2 | 
3 | This repository is no longer supported. Please use the [guide](https://cloud.yandex.ru/docs/datalens/tutorials/data-from-metrica-yc-visualization) on working with the Yandex.Metrica Logs API provided by Yandex.Cloud.
4 | 
5 | # Integration with Logs API
6 | 
7 | This script implements the integration of the Yandex.Metrica Logs API with ClickHouse.
8 | 
9 | Questions about the script can be asked in the comments on GitHub or by e-mail: miptgirl@yandex-team.ru.
10 | 
11 | ## Requirements
12 | The script is written in Python 2.7 and uses the `requests` library, which can be installed with the [pip](https://pip.pypa.io/en/stable/installing/) package manager:
13 | ```bash
14 | pip install requests
15 | ```
16 | In addition, the ClickHouse DBMS is required; installation instructions can be found on the [official site](https://clickhouse.yandex/).
17 | 
18 | ## Setting up
19 | First of all, you need to fill in the [config](./configs/config.json):
20 | ```javascript
21 | {
22 |     "token" : "", // token for accessing the Yandex.Metrica API
23 |     "counter_id": "", // counter number
24 |     "visits_fields": [ // list of visit parameters
25 |         "ym:s:counterID",
26 |         "ym:s:dateTime",
27 |         "ym:s:date",
28 |         "ym:s:firstPartyCookie"
29 |     ],
30 |     "hits_fields": [ // list of hit parameters
31 |         "ym:pv:counterID",
32 |         "ym:pv:dateTime",
33 |         "ym:pv:date",
34 |         "ym:pv:firstPartyCookie"
35 |     ],
36 |     "log_level": "INFO", // logging level
37 |     "retries": 1, // number of attempts to re-run the script in case of an error
38 |     "retries_delay": 60, // delay between retries
39 |     "clickhouse": {
40 |         "host": "http://localhost:8123", // address of the running ClickHouse instance
41 |         "user": "", // database user
42 |         "password": "", // database password
43 |         "visits_table": "visits_all", // table name for storing visits
44 |         "hits_table": "hits_all", // table name for storing hits
45 |         "database": "default" // database name for the tables
46 |     }
47 | }
48 | ```
49 | On the first run, the script creates the required tables in the database according to the configured field lists. So if you change a list in the config, you need to either drop the table completely and download the data again, or add the required columns to the database manually with the [ALTER TABLE](https://clickhouse.yandex/reference_ru.html#ALTER) command.
50 | 
51 | ## Running the program
52 | When running the program, you need to specify the data source (hits or visits) using the `-source` option.
53 | 
54 | The script can work in several modes:
55 | * __history__ - loads all data from the Metrica counter's creation date up to the day before yesterday
56 | * __regular__ - loads the data for the day before yesterday (recommended for regular downloads)
57 | * __regular_early__ - loads yesterday's data
58 | 
59 | An example of running the program:
60 | ```bash
61 | python metrica_logs_api.py -mode history -source visits
62 | ```
63 | 
64 | You can also load data for a particular time period:
65 | ```bash
66 | python metrica_logs_api.py -source hits -start_date 2016-10-10 -end_date 2016-10-18
67 | ```
68 | 
--------------------------------------------------------------------------------
/metrica_logs_api.py:
--------------------------------------------------------------------------------
1 | from collections import namedtuple
2 | import logs_api
3 | import time
4 | import clickhouse
5 | import utils
6 | import sys
7 | import datetime
8 | import logging
9 | 
10 | 
11 | def setup_logging(config):
12 |     global logger
13 |     logger = logging.getLogger('logs_api')
14 |     logging.basicConfig(stream=sys.stdout,
15 |                         level=config['log_level'],
16 |                         format='%(asctime)s %(processName)s %(levelname)-8s %(message)s',
17 |                         datefmt='%Y-%m-%d %H:%M:%S', )
18 | 
19 | 
20 | def get_date_period(options):
21 |     if options.mode is None:
22 |         start_date_str = options.start_date
23 |         end_date_str = options.end_date
24 |     else:
25 |         if options.mode == 'regular':
26 |             start_date_str = (datetime.datetime.today() - datetime.timedelta(2)) \
27 |                 .strftime(utils.DATE_FORMAT)
28 |             end_date_str = (datetime.datetime.today() - datetime.timedelta(2)) \
29 |                 .strftime(utils.DATE_FORMAT)
30 |         elif options.mode == 'regular_early':
31 |             start_date_str = (datetime.datetime.today() - datetime.timedelta(1)) \
32 |                 .strftime(utils.DATE_FORMAT)
33 |             end_date_str = (datetime.datetime.today() - datetime.timedelta(1)) \
34 |                 .strftime(utils.DATE_FORMAT)
35 |         elif options.mode == 'history':
36 |             start_date_str = utils.get_counter_creation_date(
37 |                 config['counter_id'],
38 |                 config['token']
39 |             )
40 |             end_date_str = (datetime.datetime.today() - datetime.timedelta(2)) \
41 |                 .strftime(utils.DATE_FORMAT)
42 |     return start_date_str, end_date_str
43 | 
44 | 
45 | def build_user_request(config):
46 |     options = utils.get_cli_options()
47 |     logger.info('CLI Options: ' + str(options))
48 | 
49 |     start_date_str, end_date_str = get_date_period(options)
50 |     source = options.source
51 | 
52 |     # Validate that fields are present in config
53 |     assert '{source}_fields'.format(source=source) in config, \
54 |         'Fields must be specified in config'
55 |     fields = config['{source}_fields'.format(source=source)]
56 | 
57 |     # Creating data structure (immutable tuple) with initial user request
58 |     UserRequest = namedtuple(
59 |         "UserRequest",
60 |         "token counter_id start_date_str end_date_str source fields"
61 |     )
62 | 
63 |     user_request = UserRequest(
64 |         token=config['token'],
65 |         counter_id=config['counter_id'],
66 |         start_date_str=start_date_str,
67 |         end_date_str=end_date_str,
68 |         source=source,
69 |         fields=tuple(fields),
70 |     )
71 | 
72 |     logger.info(user_request)
73 |     utils.validate_user_request(user_request)
74 |     return user_request
75 | 
76 | 
77 | def integrate_with_logs_api(config, user_request):
78 |     for i in range(config['retries']):
79 |         time.sleep(i * config['retries_delay'])
80 |         try:
81 |             # Creating API requests
82 |             api_requests = logs_api.get_api_requests(user_request)
83 | 
84 |             for api_request in api_requests:
85 |                 logger.info('### CREATING TASK')
86 |                 logs_api.create_task(api_request)
87 |                 print(api_request)
88 | 
89 |                 delay = 20
90 |                 while api_request.status != 'processed':
91 |                     logger.info('### DELAY %d secs' % delay)
92 |                     time.sleep(delay)
93 |                     logger.info('### CHECKING STATUS')
94 |                     api_request = logs_api.update_status(api_request)
95 |                     logger.info('API Request status: ' + api_request.status)
96 | 
97 |                 logger.info('### SAVING DATA')
98 |                 for part in range(api_request.size):
99 |                     logger.info('Part #' + str(part))
100 |                     logs_api.save_data(api_request, part)
101 | 
102 |                 logger.info('### CLEANING DATA')
103 |                 logs_api.clean_data(api_request)
104 | 
105 |             break  # all requests succeeded, no need to use the remaining retries
106 |         except Exception as e:
107 |             logger.critical('Iteration #{i} failed'.format(i=i + 1))
108 |             logger.critical(e)
109 |             if i == config['retries'] - 1:
110 |                 raise e
111 | 
112 | if __name__ == '__main__':
113 |     print('##### python', utils.get_python_version())
114 |     start_time = time.time()
115 | 
116 |     config = utils.get_config()
117 |     setup_logging(config)
118 | 
119 |     user_request = build_user_request(config)
120 | 
121 | 
122 |     # If data for specified period is already in database, script is skipped
123 |     if clickhouse.is_data_present(user_request.start_date_str,
124 |                                   user_request.end_date_str,
125 |                                   user_request.source):
126 |         logger.critical('Data for selected dates is already in database')
127 |         exit(0)
128 | 
129 | 
130 |     integrate_with_logs_api(config, user_request)
131 | 
132 |     end_time = time.time()
133 |     logger.info('### TOTAL TIME: %d minutes %d seconds' % (
134 |         (end_time - start_time) / 60,
135 |         (end_time - start_time) % 60
136 |     ))
137 | 
138 | 
--------------------------------------------------------------------------------
/clickhouse.py:
--------------------------------------------------------------------------------
1 | from __future__ import (absolute_import, division,
2 |                         print_function, unicode_literals)
3 | 
4 | import requests
5 | import urllib
6 | import urllib3
7 | import utils
8 | import sys
9 | import logging
10 | 
11 | config = utils.get_config()
12 | CH_HOST = config['clickhouse']['host']
13 | CH_USER = config['clickhouse']['user']
14 | CH_PASSWORD = config['clickhouse']['password']
15 | CH_VISITS_TABLE = config['clickhouse']['visits_table']
16 | CH_HITS_TABLE = config['clickhouse']['hits_table']
17 | CH_DATABASE = config['clickhouse']['database']
18 | SSL_VERIFY = (config['disable_ssl_verification_for_clickhouse'] == 0)
19 | 
20 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
21 | 
22 | logger = logging.getLogger('logs_api')
23 | 
24 | def get_clickhouse_data(query, host=CH_HOST):
25 |     '''Returns ClickHouse response'''
26 |     logger.debug(query)
27 |     if (CH_USER == '') and (CH_PASSWORD == ''):
28 |         r = requests.post(host, data=query, verify=SSL_VERIFY)
29 |     else:
30 |         r = requests.post(host, data=query, auth=(CH_USER, CH_PASSWORD), verify=SSL_VERIFY)
31 |     if r.status_code == 200:
32 |         return r.text
33 |     else:
34 |         raise ValueError(r.text)
35 | 
36 | 
37 | def upload(table, content, host=CH_HOST):
38 |     '''Uploads data to a table in ClickHouse'''
39 |     content = content.encode('utf-8')
40 |     query_dict = {
41 |         'query': 'INSERT INTO ' + table + ' FORMAT TabSeparatedWithNames '
42 |     }
43 |     if (CH_USER == '') and (CH_PASSWORD == ''):
44 |         r = requests.post(host, data=content, params=query_dict, verify=SSL_VERIFY)
45 |     else:
46 |         r = requests.post(host, data=content, params=query_dict,
47 |                           auth=(CH_USER, CH_PASSWORD), verify=SSL_VERIFY)
48 |     result = r.text
49 |     if r.status_code == 200:
50 | 
return result 51 | else: 52 | raise ValueError(r.text) 53 | 54 | 55 | def get_source_table_name(source, with_db=True): 56 | '''Returns table name in database''' 57 | if source == 'hits': 58 | if with_db: 59 | return '{db}.{table}'.format(db=CH_DATABASE, table=CH_HITS_TABLE) 60 | else: 61 | return CH_HITS_TABLE 62 | if source == 'visits': 63 | if with_db: 64 | return '{db}.{table}'.format(db=CH_DATABASE, table=CH_VISITS_TABLE) 65 | else: 66 | return CH_VISITS_TABLE 67 | 68 | 69 | def get_tables(): 70 | '''Returns list of tables in database''' 71 | return get_clickhouse_data('SHOW TABLES FROM {db}'.format(db=CH_DATABASE))\ 72 | .strip().split('\n') 73 | 74 | def get_dbs(): 75 | ''''Returns list of databases''' 76 | return get_clickhouse_data('SHOW DATABASES')\ 77 | .strip().split('\n') 78 | 79 | 80 | def is_table_present(source): 81 | '''Returns whether table for data is already present in database''' 82 | return get_source_table_name(source, with_db=False) in get_tables() 83 | 84 | def is_db_present(): 85 | '''Returns whether a database is already present in clickhouse''' 86 | return CH_DATABASE in get_dbs() 87 | 88 | def create_db(): 89 | '''Creates database in clickhouse''' 90 | return get_clickhouse_data('CREATE DATABASE {db}'.format(db=CH_DATABASE)) 91 | 92 | 93 | def get_ch_field_name(field_name): 94 | '''Converts Logs API parameter name to ClickHouse column name''' 95 | prefixes = ['ym:s:', 'ym:pv:'] 96 | for prefix in prefixes: 97 | field_name = field_name.replace(prefix, '') 98 | return field_name[0].upper() + field_name[1:] 99 | 100 | 101 | def drop_table(source): 102 | '''Drops table in ClickHouse''' 103 | query = 'DROP TABLE IF EXISTS {table}'.format( 104 | table=get_source_table_name(source)) 105 | get_clickhouse_data(query) 106 | 107 | 108 | def create_table(source, fields): 109 | '''Creates table in ClickHouse for hits/visits with particular fields''' 110 | tmpl = ''' 111 | CREATE TABLE {table_name} ( 112 | {fields} 113 | ) ENGINE = {engine} 114 | ''' 115 | field_tmpl = '{name} {type}' 116 | field_statements = [] 117 | 118 | table_name = get_source_table_name(source) 119 | if source == 'hits': 120 | if ('ym:pv:date' in fields) and ('ym:pv:clientID' in fields): 121 | engine = 'MergeTree(Date, intHash32(ClientID), (Date, intHash32(ClientID)), 8192)' 122 | else: 123 | engine = 'Log' 124 | 125 | if source == 'visits': 126 | if ('ym:s:date' in fields) and ('ym:s:clientID' in fields): 127 | engine = 'MergeTree(Date, intHash32(ClientID), (Date, intHash32(ClientID)), 8192)' 128 | else: 129 | engine = 'Log' 130 | 131 | ch_field_types = utils.get_ch_fields_config() 132 | ch_fields = list(map(get_ch_field_name, fields)) 133 | 134 | for i in range(len(fields)): 135 | field_statements.append(field_tmpl.format(name= ch_fields[i], 136 | type=ch_field_types[fields[i]])) 137 | 138 | field_statements = sorted(field_statements) 139 | query = tmpl.format(table_name=table_name, 140 | engine=engine, 141 | fields=',\n'.join(sorted(field_statements))) 142 | 143 | get_clickhouse_data(query) 144 | 145 | 146 | def save_data(source, fields, data): 147 | '''Inserts data into ClickHouse table''' 148 | 149 | if not is_db_present(): 150 | logger.info('Database created') 151 | create_db() 152 | 153 | if not is_table_present(source): 154 | logger.info('Table created') 155 | create_table(source, fields) 156 | 157 | upload(get_source_table_name(source), data) 158 | 159 | 160 | def is_data_present(start_date_str, end_date_str, source): 161 | '''Returns whether there is a records in database for particular date 
range and source''' 162 | if not is_db_present(): 163 | return False 164 | 165 | if not is_table_present(source): 166 | return False 167 | 168 | table_name = get_source_table_name(source) 169 | query = ''' 170 | SELECT count() 171 | FROM {table} 172 | WHERE Date >= '{start_date}' AND Date <= '{end_date}' 173 | '''.format(table=table_name, 174 | start_date=start_date_str, 175 | end_date=end_date_str) 176 | 177 | visits = get_clickhouse_data(query, CH_HOST) 178 | is_null = (visits == '') or (visits.strip() == '0') 179 | return not is_null 180 | 181 | -------------------------------------------------------------------------------- /logs_api.py: -------------------------------------------------------------------------------- 1 | from __future__ import (absolute_import, division, 2 | print_function, unicode_literals) 3 | 4 | import requests 5 | 6 | import json 7 | import utils 8 | import clickhouse 9 | import datetime 10 | import logging 11 | 12 | if utils.get_python_version().startswith('2'): 13 | from urllib import urlencode 14 | else: 15 | from urllib.parse import urlencode 16 | 17 | 18 | logger = logging.getLogger('logs_api') 19 | 20 | HOST = 'https://api-metrika.yandex.ru' 21 | 22 | 23 | def get_estimation(user_request): 24 | '''Returns estimation of Logs API (whether it's possible to load data and max period in days)''' 25 | url_params = urlencode( 26 | [ 27 | ('date1', user_request.start_date_str), 28 | ('date2', user_request.end_date_str), 29 | ('source', user_request.source), 30 | ('fields', ','.join(user_request.fields)) 31 | ] 32 | ) 33 | 34 | headers = {'Authorization': 'OAuth ' + user_request.token} 35 | 36 | url = '{host}/management/v1/counter/{counter_id}/logrequests/evaluate?'\ 37 | .format(host=HOST, counter_id=user_request.counter_id) + url_params 38 | 39 | r = requests.get(url, headers=headers) 40 | 41 | if r.status_code == 200: 42 | return json.loads(r.text)['log_request_evaluation'] 43 | else: 44 | raise ValueError(r) 45 | 46 | 47 | def get_api_requests(user_request): 48 | '''Returns list of API requests for UserRequest''' 49 | api_requests = [] 50 | estimation = get_estimation(user_request) 51 | if estimation['possible']: 52 | api_request = utils.Structure( 53 | user_request=user_request, 54 | date1_str=user_request.start_date_str, 55 | date2_str=user_request.end_date_str, 56 | status='new' 57 | ) 58 | api_requests.append(api_request) 59 | elif estimation['max_possible_day_quantity'] != 0: 60 | start_date = datetime.datetime.strptime( 61 | user_request.start_date_str, 62 | utils.DATE_FORMAT 63 | ) 64 | 65 | end_date = datetime.datetime.strptime( 66 | user_request.end_date_str, 67 | utils.DATE_FORMAT 68 | ) 69 | 70 | days = (end_date - start_date).days 71 | num_requests = int(days/estimation['max_possible_day_quantity']) + 1 72 | days_in_period = int(days/num_requests) + 1 73 | for i in range(num_requests): 74 | date1 = start_date + datetime.timedelta(i*days_in_period) 75 | date2 = min( 76 | end_date, 77 | start_date + datetime.timedelta((i+1)*days_in_period - 1) 78 | ) 79 | 80 | api_request = utils.Structure( 81 | user_request=user_request, 82 | date1_str=date1.strftime(utils.DATE_FORMAT), 83 | date2_str=date2.strftime(utils.DATE_FORMAT), 84 | status='new' 85 | ) 86 | api_requests.append(api_request) 87 | else: 88 | raise RuntimeError('Logs API can\'t load data: max_possible_day_quantity = 0') 89 | return api_requests 90 | 91 | def create_task(api_request): 92 | '''Creates a Logs API task to generate data''' 93 | url_params = urlencode( 94 | [ 95 | ('date1', 
api_request.date1_str), 96 | ('date2', api_request.date2_str), 97 | ('source', api_request.user_request.source), 98 | ('fields', ','.join(sorted(api_request.user_request.fields, key=lambda s: s.lower()))) 99 | ] 100 | ) 101 | 102 | url = '{host}/management/v1/counter/{counter_id}/logrequests?'\ 103 | .format(host=HOST, 104 | counter_id=api_request.user_request.counter_id) \ 105 | + url_params 106 | 107 | headers = {'Authorization': 'OAuth ' + api_request.user_request.token} 108 | 109 | r = requests.post(url, headers=headers) 110 | logger.debug(r.text) 111 | if r.status_code == 200: 112 | logger.debug(json.dumps(json.loads(r.text)['log_request'], indent=2)) 113 | response = json.loads(r.text)['log_request'] 114 | api_request.status = response['status'] 115 | api_request.request_id = response['request_id'] 116 | # api_request.size = response['size'] 117 | return response 118 | else: 119 | raise ValueError(r.text) 120 | 121 | 122 | def update_status(api_request): 123 | '''Returns current tasks\'s status''' 124 | url = '{host}/management/v1/counter/{counter_id}/logrequest/{request_id}' \ 125 | .format(request_id=api_request.request_id, 126 | counter_id=api_request.user_request.counter_id, 127 | host=HOST) 128 | 129 | headers = {'Authorization': 'OAuth ' + api_request.user_request.token} 130 | 131 | r = requests.get(url, headers=headers) 132 | logger.debug(r.text) 133 | if r.status_code == 200: 134 | status = json.loads(r.text)['log_request']['status'] 135 | api_request.status = status 136 | if status == 'processed': 137 | size = len(json.loads(r.text)['log_request']['parts']) 138 | api_request.size = size 139 | return api_request 140 | else: 141 | raise ValueError(r.text) 142 | 143 | 144 | def save_data(api_request, part): 145 | '''Loads data chunk from Logs API and saves to ClickHouse''' 146 | url = '{host}/management/v1/counter/{counter_id}/logrequest/{request_id}/part/{part}/download' \ 147 | .format( 148 | host=HOST, 149 | counter_id=api_request.user_request.counter_id, 150 | request_id=api_request.request_id, 151 | part=part 152 | ) 153 | 154 | headers = {'Authorization': 'OAuth ' + api_request.user_request.token} 155 | 156 | r = requests.get(url, headers=headers) 157 | if r.status_code != 200: 158 | logger.debug(r.text) 159 | raise ValueError(r.text) 160 | 161 | 162 | 163 | splitted_text = r.text.split('\n') 164 | logger.info('### DATA SAMPLE') 165 | logger.info('\n'.join(splitted_text[:5])) 166 | 167 | headers_num = len(splitted_text[0].split('\t')) 168 | splitted_text_filtered = list(filter(lambda x: len(x.split('\t')) == headers_num, r.text.split('\n'))) 169 | num_filtered = len(splitted_text) - len(splitted_text_filtered) 170 | if num_filtered != 0: 171 | logger.warning('%d rows were filtered out' % num_filtered) 172 | 173 | if len(splitted_text_filtered) > 1: 174 | output_data = '\n'.join(splitted_text_filtered[1:]) #.encode('utf-8') 175 | output_data = '\t'.join(map(clickhouse.get_ch_field_name, splitted_text_filtered[0].split('\t'))) + '\n' + output_data # convert headers to CH column names 176 | output_data = output_data.replace(r"\'", "'") # to correct escapes in params 177 | 178 | clickhouse.save_data(api_request.user_request.source, 179 | api_request.user_request.fields, 180 | output_data) 181 | else: 182 | logger.warning('### No data to upload') 183 | 184 | api_request.status = 'saved' 185 | 186 | def clean_data(api_request): 187 | '''Cleans generated data on server''' 188 | url = '{host}/management/v1/counter/{counter_id}/logrequest/{request_id}/clean' \ 189 | 
.format(host=HOST, 190 | counter_id=api_request.user_request.counter_id, 191 | request_id=api_request.request_id) 192 | 193 | headers = {'Authorization': 'OAuth ' + api_request.user_request.token} 194 | 195 | r = requests.post(url, headers=headers) 196 | logger.debug(r.text) 197 | if r.status_code != 200: 198 | raise ValueError(r.text) 199 | 200 | api_request.status = json.loads(r.text)['log_request']['status'] 201 | return json.loads(r.text)['log_request'] 202 | -------------------------------------------------------------------------------- /configs/ch_types.json: -------------------------------------------------------------------------------- 1 | { 2 | "ym:s:counterID": "UInt32", 3 | "ym:s:watchIDs": "Array(UInt64)", 4 | "ym:s:dateTime": "DateTime", 5 | "ym:s:dateTimeUTC": "DateTime", 6 | "ym:s:isNewUser": "UInt8", 7 | "ym:s:startURL": "String", 8 | "ym:s:endURL": "String", 9 | "ym:s:pageViews": "Int32", 10 | "ym:s:visitDuration": "UInt32", 11 | "ym:s:bounce": "UInt8", 12 | "ym:s:ipAddress": "String", 13 | "ym:s:params": "String", 14 | "ym:s:goalsID": "Array(UInt32)", 15 | "ym:s:goalsSerialNumber": "Array(UInt32)", 16 | "ym:s:goalsDateTime": "Array(DateTime)", 17 | "ym:s:goalsPrice": "Array(Int64)", 18 | "ym:s:goalsOrder": "Array(String)", 19 | "ym:s:goalsCurrency": "Array(String)", 20 | "ym:s:clientID": "UInt64", 21 | "ym:s:lastTrafficSource": "String", 22 | "ym:s:lastAdvEngine": "String", 23 | "ym:s:lastReferalSource": "String", 24 | "ym:s:lastSearchEngineRoot": "String", 25 | "ym:s:lastSearchEngine": "String", 26 | "ym:s:lastSocialNetwork": "String", 27 | "ym:s:lastSocialNetworkProfile": "String", 28 | "ym:s:referer": "String", 29 | "ym:s:lastDirectClickOrder": "UInt32", 30 | "ym:s:lastDirectBannerGroup": "UInt32", 31 | "ym:s:lastDirectClickBanner": "String", 32 | "ym:s:lastDirectPhraseOrCond": "String", 33 | "ym:s:lastDirectPlatformType": "String", 34 | "ym:s:lastDirectPlatform": "String", 35 | "ym:s:lastDirectSearchPhrase": "String", 36 | "ym:s:lastDirectConditionType": "String", 37 | "ym:s:lastCurrencyID": "String", 38 | "ym:s:from": "String", 39 | "ym:s:UTMCampaign": "String", 40 | "ym:s:UTMContent": "String", 41 | "ym:s:UTMMedium": "String", 42 | "ym:s:UTMSource": "String", 43 | "ym:s:UTMTerm": "String", 44 | "ym:s:openstatAd": "String", 45 | "ym:s:openstatCampaign": "String", 46 | "ym:s:openstatService": "String", 47 | "ym:s:openstatSource": "String", 48 | "ym:s:hasGCLID": "UInt8", 49 | "ym:s:regionCountry": "String", 50 | "ym:s:regionCity": "String", 51 | "ym:s:browserLanguage": "String", 52 | "ym:s:browserCountry": "String", 53 | "ym:s:clientTimeZone": "Int16", 54 | "ym:s:deviceCategory": "String", 55 | "ym:s:mobilePhone": "String", 56 | "ym:s:mobilePhoneModel": "String", 57 | "ym:s:operatingSystemRoot": "String", 58 | "ym:s:operatingSystem": "String", 59 | "ym:s:browser": "String", 60 | "ym:s:browserMajorVersion": "UInt16", 61 | "ym:s:browserMinorVersion": "UInt16", 62 | "ym:s:browserEngine": "String", 63 | "ym:s:browserEngineVersion1": "UInt16", 64 | "ym:s:browserEngineVersion2": "UInt16", 65 | "ym:s:browserEngineVersion3": "UInt16", 66 | "ym:s:browserEngineVersion4": "UInt16", 67 | "ym:s:cookieEnabled": "UInt8", 68 | "ym:s:javascriptEnabled": "UInt8", 69 | "ym:s:flashMajor": "UInt8", 70 | "ym:s:flashMinor": "UInt8", 71 | "ym:s:screenFormat": "UInt16", 72 | "ym:s:screenColors": "UInt8", 73 | "ym:s:screenOrientation": "String", 74 | "ym:s:screenWidth": "UInt16", 75 | "ym:s:screenHeight": "UInt16", 76 | "ym:s:physicalScreenWidth": "UInt16", 77 | "ym:s:physicalScreenHeight": 
"UInt16", 78 | "ym:s:windowClientWidth": "UInt16", 79 | "ym:s:windowClientHeight": "UInt16", 80 | "ym:s:purchaseID": "Array(String)", 81 | "ym:s:purchaseDateTime": "Array(DateTime)", 82 | "ym:s:purchaseAffiliation": "Array(String)", 83 | "ym:s:purchaseRevenue": "Array(Float64)", 84 | "ym:s:purchaseTax": "Array(Float64)", 85 | "ym:s:purchaseShipping": "Array(Float64)", 86 | "ym:s:purchaseCoupon": "Array(String)", 87 | "ym:s:purchaseCurrency": "Array(String)", 88 | "ym:s:purchaseProductQuantity": "Array(Int64)", 89 | "ym:s:productsPurchaseID": "Array(String)", 90 | "ym:s:productsID": "Array(String)", 91 | "ym:s:productsName": "Array(String)", 92 | "ym:s:productsBrand": "Array(String)", 93 | "ym:s:productsCategory": "Array(String)", 94 | "ym:s:productsCategory1": "Array(String)", 95 | "ym:s:productsCategory2": "Array(String)", 96 | "ym:s:productsCategory3": "Array(String)", 97 | "ym:s:productsCategory4": "Array(String)", 98 | "ym:s:productsCategory5": "Array(String)", 99 | "ym:s:productsVariant": "Array(String)", 100 | "ym:s:productsPosition": "Array(Int32)", 101 | "ym:s:productsPrice": "Array(Float64)", 102 | "ym:s:productsCurrency": "Array(String)", 103 | "ym:s:productsCoupon": "Array(String)", 104 | "ym:s:productsQuantity": "Array(Int64)", 105 | "ym:s:impressionsURL": "Array(String)", 106 | "ym:s:impressionsDateTime": "Array(DateTime)", 107 | "ym:s:impressionsProductID": "Array(String)", 108 | "ym:s:impressionsProductName": "Array(String)", 109 | "ym:s:impressionsProductBrand": "Array(String)", 110 | "ym:s:impressionsProductCategory": "Array(String)", 111 | "ym:s:impressionsProductCategory1": "Array(String)", 112 | "ym:s:impressionsProductCategory2": "Array(String)", 113 | "ym:s:impressionsProductCategory3": "Array(String)", 114 | "ym:s:impressionsProductCategory4": "Array(String)", 115 | "ym:s:impressionsProductCategory5": "Array(String)", 116 | "ym:s:impressionsProductVariant": "Array(String)", 117 | "ym:s:impressionsProductPrice": "Array(Int64)", 118 | "ym:s:impressionsProductCurrency": "Array(String)", 119 | "ym:s:impressionsProductCoupon": "Array(String)", 120 | "ym:s:lastDirectClickOrderName": "String", 121 | "ym:s:lastClickBannerGroupName": "String", 122 | "ym:s:lastDirectClickBannerName": "String", 123 | "ym:s:networkType": "String", 124 | "ym:s:visitID": "UInt64", 125 | "ym:s:date": "Date", 126 | "ym:s:regionCountryID": "UInt32", 127 | "ym:s:regionCityID": "UInt32", 128 | "ym:s:lastGCLID": "String", 129 | "ym:s:firstGCLID": "String", 130 | "ym:s:lastSignificantGCLID": "String", 131 | "ym:s:offlineCallTalkDuration": "Array(UInt32)", 132 | "ym:s:offlineCallHoldDuration": "Array(UInt32)", 133 | "ym:s:offlineCallMissed": "Array(UInt32)", 134 | "ym:s:offlineCallTag": "Array(String)", 135 | "ym:s:offlineCallFirstTimeCaller": "Array(UInt32)", 136 | "ym:s:offlineCallURL": "Array(String)", 137 | "ym:s:parsedParamsKey1": "Array(String)", 138 | "ym:s:parsedParamsKey2": "Array(String)", 139 | "ym:s:parsedParamsKey3": "Array(String)", 140 | "ym:s:parsedParamsKey4": "Array(String)", 141 | "ym:s:parsedParamsKey5": "Array(String)", 142 | "ym:s:parsedParamsKey6": "Array(String)", 143 | "ym:s:parsedParamsKey7": "Array(String)", 144 | "ym:s:parsedParamsKey8": "Array(String)", 145 | "ym:s:parsedParamsKey9": "Array(String)", 146 | "ym:s:parsedParamsKey10": "Array(String)", 147 | 148 | "ym:pv:watchID": "UInt64", 149 | "ym:pv:counterID": "UInt32", 150 | "ym:pv:dateTime": "DateTime", 151 | "ym:pv:title": "String", 152 | "ym:pv:URL": "String", 153 | "ym:pv:referer": "String", 154 | "ym:pv:UTMCampaign": 
"String", 155 | "ym:pv:UTMContent": "String", 156 | "ym:pv:UTMMedium": "String", 157 | "ym:pv:UTMSource": "String", 158 | "ym:pv:UTMTerm": "String", 159 | "ym:pv:browser": "String", 160 | "ym:pv:browserMajorVersion": "UInt16", 161 | "ym:pv:browserMinorVersion": "UInt16", 162 | "ym:pv:browserCountry": "String", 163 | "ym:pv:browserEngine": "String", 164 | "ym:pv:browserEngineVersion1": "UInt16", 165 | "ym:pv:browserEngineVersion2": "UInt16", 166 | "ym:pv:browserEngineVersion3": "UInt16", 167 | "ym:pv:browserEngineVersion4": "UInt16", 168 | "ym:pv:browserLanguage": "String", 169 | "ym:pv:clientTimeZone": "Int16", 170 | "ym:pv:cookieEnabled": "UInt8", 171 | "ym:pv:deviceCategory": "String", 172 | "ym:pv:flashMajor": "UInt8", 173 | "ym:pv:flashMinor": "UInt8", 174 | "ym:pv:from": "String", 175 | "ym:pv:hasGCLID": "UInt8", 176 | "ym:pv:ipAddress": "String", 177 | "ym:pv:javascriptEnabled": "UInt8", 178 | "ym:pv:mobilePhone": "String", 179 | "ym:pv:mobilePhoneModel": "String", 180 | "ym:pv:openstatAd": "String", 181 | "ym:pv:openstatCampaign": "String", 182 | "ym:pv:openstatService": "String", 183 | "ym:pv:openstatSource": "String", 184 | "ym:pv:operatingSystem": "String", 185 | "ym:pv:operatingSystemRoot": "String", 186 | "ym:pv:physicalScreenHeight": "UInt16", 187 | "ym:pv:physicalScreenWidth": "UInt16", 188 | "ym:pv:regionCity": "String", 189 | "ym:pv:regionCountry": "String", 190 | "ym:pv:screenColors": "UInt8", 191 | "ym:pv:screenFormat": "UInt16", 192 | "ym:pv:screenHeight": "UInt16", 193 | "ym:pv:screenOrientation": "String", 194 | "ym:pv:screenWidth": "UInt16", 195 | "ym:pv:windowClientHeight": "UInt16", 196 | "ym:pv:windowClientWidth": "UInt16", 197 | "ym:pv:params": "String", 198 | "ym:pv:lastTrafficSource": "String", 199 | "ym:pv:lastSearchEngine": "String", 200 | "ym:pv:lastSearchEngineRoot": "String", 201 | "ym:pv:lastAdvEngine": "String", 202 | "ym:pv:artificial": "UInt8", 203 | "ym:pv:pageCharset": "String", 204 | "ym:pv:link": "UInt8", 205 | "ym:pv:download": "UInt8", 206 | "ym:pv:notBounce": "UInt8", 207 | "ym:pv:event": "UInt8", 208 | "ym:pv:lastSocialNetwork": "String", 209 | "ym:pv:httpError": "String", 210 | "ym:pv:clientID": "UInt64", 211 | "ym:pv:networkType": "String", 212 | "ym:pv:lastSocialNetworkProfile": "String", 213 | "ym:pv:goalsID": "Array(UInt32)", 214 | "ym:pv:shareService": "String", 215 | "ym:pv:shareURL": "String", 216 | "ym:pv:shareTitle": "String", 217 | "ym:pv:iFrame": "UInt8", 218 | "ym:pv:date": "Date", 219 | "ym:pv:GCLID": "String", 220 | "ym:pv:regionCityID": "UInt32", 221 | "ym:pv:regionCountryID": "UInt32", 222 | "ym:pv:isPageView": "UInt8", 223 | "ym:pv:parsedParamsKey1": "Array(String)", 224 | "ym:pv:parsedParamsKey2": "Array(String)", 225 | "ym:pv:parsedParamsKey3": "Array(String)", 226 | "ym:pv:parsedParamsKey4": "Array(String)", 227 | "ym:pv:parsedParamsKey5": "Array(String)", 228 | "ym:pv:parsedParamsKey6": "Array(String)", 229 | "ym:pv:parsedParamsKey7": "Array(String)", 230 | "ym:pv:parsedParamsKey8": "Array(String)", 231 | "ym:pv:parsedParamsKey9": "Array(String)", 232 | "ym:pv:parsedParamsKey10": "Array(String)" 233 | } 234 | --------------------------------------------------------------------------------