├── doc └── Architecture_s.jpg ├── functions ├── requirements.txt ├── local.settings.json ├── CSVDataADXIngestEGFunc │ └── __init__.py ├── JSONConfigADIngestEGFunc │ └── __init__.py ├── CSVFileMergeADXIngestFunc │ └── __init__.py └── ADXIngestMonitorFunc │ └── __init__.py └── README.md /doc/Architecture_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tikyau/ADXAutoFileIngestion/master/doc/Architecture_s.jpg -------------------------------------------------------------------------------- /functions/requirements.txt: -------------------------------------------------------------------------------- 1 | azure-functions 2 | azure-kusto-data 3 | azure-kusto-ingest 4 | azure-cosmos 5 | pandas==0.24.2 6 | azure-storage==0.36.0 7 | applicationinsights 8 | azure-eventgrid -------------------------------------------------------------------------------- /functions/local.settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "IsEncrypted": false, 3 | "Values": { 4 | "AzureWebJobsStorage": "", 5 | "FUNCTIONS_WORKER_RUNTIME": "python", 6 | "FUNCTIONS_EXTENSION_VERSION": "~2", 7 | "APPINSIGHTS_INSTRUMENTATIONKEY": "", 8 | 9 | "COSMOS_URL":"https:/[COSMOS Server].documents.azure.com:443/", 10 | "COSMOS_KEY":"", 11 | "COSMOS_DATABASE":"", 12 | "COSMOS_CONTAINER":"", 13 | 14 | "SOURCE_TELEMETRY_BLOB_ACCOUNT":"", 15 | "SOURCE_TELEMETRY_FILE_BLOB_KEY":"", 16 | 17 | "PROCESSED_TELEMETRY_FOLDER":"telemetr", 18 | 19 | "APP_AAD_TENANT_ID": "", 20 | "APP_CLIENT_ID": "", 21 | "APP_CLIENT_SECRETS": "", 22 | 23 | "APP_INSIGHT_ID": "", 24 | "APP_INSIGHT_MAIN_ERROR_EVENT_NAME":"MONITOR_ADX_ERROR", 25 | "APP_INSIGHT_INGEST_SUCCESS_COUNT_NAME":"INGEST_SUCCESS_COUNT", 26 | "APP_INSIGHT_INGEST_FAILURE_COUNT_NAME":"INGEST_FAILURE_COUNT", 27 | "APP_INSIGHT_INGEST_SUCCESS_EVENT_NAME":"INGEST_JSON_SUCCESS", 28 | "APP_INSIGHT_INGEST_FAILURE_EVENT_NAME":"INGEST_JSON_FAILURE", 29 | 30 | "SUCCESS_STATUS":"SUCCESS", 31 | "FAILURE_STATUS":"FAILURE", 32 | 33 | "LOG_MESSAGE_HEADER": "[ADX-MONITOR]", 34 | "PROCESS_PROGRAM_NAME": "MONITOR_ADX_V001a", 35 | 36 | "EVENT_SUBJECT_FILTER_REGEX": "/_telemetry/", 37 | "DATA_INGESTION_URI": "https://[KUSTO INGESTION].kusto.windows.net:443;Federated Security=True;Application Client Id=;Application Key=", 38 | 39 | "EVENT_GRID_ENDPOINT":"[EVENT GRID ENDPOINT].eventgrid.azure.net", 40 | "EVENT_GRID_KEY":"" 41 | 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # An automatic File Ingestion System for Azure Data Explorer 2 | 3 | This project is an automatic file ingestion system for [Azure Data Explorer (ADX or aka Kusto)](https://docs.microsoft.com/en-us/azure/data-explorer/data-explorer-overview). 4 | 5 | Azure Data Explorer is a fast and highly scalable data exploration service for log and telemetry data. To leverage it's powerful data analysis capablities the very basic step is you will need to ingest data into it. 6 | 7 | The design concept is when you have new data that need to ingest into ADX, you just need to upload the data into Azure Data Lake Gen2 or Azure Blob storage. The system will be triggered and ingests the data into a pre-defined table in ADX. It is useful when you need to continue to ingest log data from other systems 24&7. 8 | 9 | All the ingestion activities and results will be tracked and keep in Application Insight and COSMOS DB. The reason we keep the data in COSMOS DB is it provides an easier way to query and run statistic calculation for the ingestion status (eg. how much files are successfully ingested, which one is successfully ingested) then Application Insight. We also track if the same files (by name) are been ingested twice in COSMOS DB. If something goes wrong, the system will send alert to EventGrid and you will can have other system to get the notification (In our case, we send the alert message to Slack using LogicApp) 10 | 11 | The system leverages Azure Functions and EventGrid to trigger and connect each services. 12 | 13 | Here is an overview architecture: 14 | 15 | ![Overview Architecture](https://github.com/Herman-Wu/ADXAutoFileIngestion/blob/master/doc/Architecture_s.jpg) 16 | 17 | All the codes are implemented in python. 18 | 19 | **Note** 20 | 21 | The code is been extracted from a production project and I removed/changed the code specific related to the project. So some parts of the code will not working and need some fix to get it to work. But that should not be a lot of effort. 22 | 23 | I will try to find time to fix the broken part. 24 | 25 | -------------------------------------------------------------------------------- /functions/CSVDataADXIngestEGFunc/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pandas as pd 4 | import time,datetime 5 | from io import StringIO 6 | from azure.storage.blob import BlockBlobService 7 | from pandas.core.indexes.api import _union_indexes 8 | import json 9 | import codecs 10 | import re 11 | import os,sys 12 | import copy 13 | import io 14 | import zipfile 15 | import re 16 | import uuid 17 | 18 | import azure.functions as func 19 | 20 | from azure.kusto.data.request import KustoClient, KustoConnectionStringBuilder 21 | from azure.kusto.data.exceptions import KustoServiceError 22 | from azure.kusto.data.helpers import dataframe_from_result_table 23 | from azure.kusto.ingest import KustoIngestClient, IngestionProperties, FileDescriptor, BlobDescriptor, DataFormat, ReportLevel, ReportMethod 24 | from azure.kusto.ingest import ( 25 | BlobDescriptor, 26 | IngestionProperties, 27 | DataFormat, 28 | CsvColumnMapping, 29 | JsonColumnMapping, 30 | ReportLevel, 31 | ReportMethod, 32 | ValidationPolicy, 33 | ValidationOptions, 34 | ValidationImplications, 35 | ) 36 | 37 | import azure.cosmos.cosmos_client as cosmos_client 38 | import azure.cosmos.errors as errors 39 | import azure.cosmos.http_constants as http_constant 40 | 41 | from applicationinsights import TelemetryClient 42 | 43 | 44 | # COSMOS CONFIG 45 | COSMOS_URL="https://[COSMOS Server].documents.azure.com:443/" 46 | COSMOS_KEY="" 47 | COSMOS_DATABASE='[COSMOS DATABASE]' 48 | COSMOS_CONTAINER='[]' 49 | 50 | 51 | SOURCE_CSV_BLOB_ACCOUNT='' 52 | SOURCE_CSV_FILE_BLOB_KEY="" 53 | SOURCE_CSV_CONTAINER="" 54 | SOURCE_CSV_FILE_TOKEN="" 55 | 56 | 57 | # ADX CONFIG 58 | APP_AAD_TENANT_ID = "" 59 | APP_CLIENT_ID = '' 60 | APP_CLIENT_SECRETS='' 61 | 62 | DATA_INGESTION_URI = "https://[KUSTO Ingestion ].kusto.windows.net:443;Federated Security=True;Application Client Id="+APP_CLIENT_ID+";Application Key="+APP_CLIENT_SECRETS 63 | DATABASE = '' 64 | DESTINATION_TABLE = "" 65 | 66 | APP_INSIGHT_ID="" 67 | APP_INSIGHT_INGEST_EVENT_NAME="EVENTGRID_INGEST_CSV" 68 | APP_INSIGHT_INGEST_RECORDS_COUNT_NAME="INGEST_CSV_SOURCE_FILES_COUNT" 69 | 70 | LOG_MESSAGE_HEADER="[Ingest Blob EventGrid]" 71 | MAIN_INGESTION_STATUS_COSMOS_DOC_TYPE="INGESTION_EVENTGRID" 72 | PROCESS_PROGRAM_NAME="INGESTION_EVENTGRID_V01A" 73 | 74 | EVENT_SUBJECT_FILTER_REGEX="^((?!data_dsa_telemetry).)*$" 75 | 76 | DESTINATION_TABLE_COLUMN_MAPPING = "CSV_Mapping01A" 77 | IS_FLUSH_IMMEDIATELY=True 78 | 79 | vm_uuid="" 80 | deploy_uuid="" 81 | config_uuid="" 82 | 83 | 84 | def main(event: func.EventGridEvent): 85 | result = json.dumps({ 86 | 'id': event.id, 87 | 'data': event.get_json(), 88 | 'topic': event.topic, 89 | 'subject': event.subject, 90 | 'event_type': event.event_type, 91 | }) 92 | 93 | logging.info('{} Python EventGrid trigger processed an event:{}'.format(LOG_MESSAGE_HEADER ,result)) 94 | 95 | 96 | get_config_values() 97 | regexp = re.compile(EVENT_SUBJECT_FILTER_REGEX) 98 | 99 | if regexp.search(event.subject): # Check if file path match criteria 100 | tc = TelemetryClient(APP_INSIGHT_ID) 101 | tc.context.application.ver = '1.0' 102 | tc.context.properties["PROCESS_PROGRAM"]=PROCESS_PROGRAM_NAME 103 | tc.context.properties["PROCESS_START"]=time.time() 104 | tc.track_trace('{} STRAT RUN EVENTGRID INGEST TELEMETRY JSON DATA from folder {} '.format(LOG_MESSAGE_HEADER,result)) 105 | tc.flush() 106 | 107 | telemetry_block_blob_service = BlockBlobService(account_name=SOURCE_OSMETRICS_BLOB_ACCOUNT, account_key=SOURCE_OSMETRICS_FILE_BLOB_KEY) 108 | filepath=get_file_path(event.subject) 109 | container_name=get_container_name(event.subject) 110 | 111 | logging.info('{} filepath: {}'.format(LOG_MESSAGE_HEADER,filepath)) 112 | logging.info('{} container_name: {}'.format(LOG_MESSAGE_HEADER, container_name)) 113 | ingest_to_ADX(filepath,telemetry_block_blob_service,container_name,SOURCE_OSMETRICS_BLOB_ACCOUNT, tc) 114 | else: 115 | logging.info("{} Subject : {} does not match regular express {}. Skip process. ".format(LOG_MESSAGE_HEADER,event.subject,EVENT_SUBJECT_FILTER_REGEX)) 116 | 117 | 118 | 119 | def get_config_values(): 120 | global COSMOS_URL,COSMOS_KEY,COSMOS_DATABASE,COSMOS_CONTAINER 121 | global SOURCE_OSMETRICS_BLOB_ACCOUNT,SOURCE_OSMETRICS_FILE_BLOB_KEY,SOURCE_OSMETRICS_CONTAINER,SOURCE_OSMETRICS_FILE_TOKEN 122 | global APP_AAD_TENANT_ID,APP_CLIENT_ID,APP_CLIENT_SECRETS,DATA_INGESTION_URI,DATABASE,DESTINATION_TABLE 123 | global APP_INSIGHT_ID,APP_INSIGHT_INGEST_EVENT_NAME,APP_INSIGHT_INGEST_RECORDS_COUNT_NAME 124 | global LOG_MESSAGE_HEADER,MAIN_INGESTION_STATUS_COSMOS_DOC_TYPE,PROCESS_PROGRAM_NAME,EVENT_SUBJECT_FILTER_REGEX 125 | global DESTINATION_TABLE_COLUMN_MAPPING,IS_FLUSH_IMMEDIATELY 126 | 127 | # COSMOS CONFIG 128 | COSMOS_URL= os.getenv("COSMOS_URL",COSMOS_URL) 129 | COSMOS_KEY= os.getenv("COSMOS_KEY",COSMOS_KEY) 130 | COSMOS_DATABASE=os.getenv("COSMOS_DATABASE",COSMOS_DATABASE) 131 | COSMOS_CONTAINER=os.getenv("COSMOS_CONTAINER",COSMOS_CONTAINER) 132 | 133 | 134 | SOURCE_OSMETRICS_BLOB_ACCOUNT=os.getenv("SOURCE_OSMETRICS_BLOB_ACCOUNT",SOURCE_OSMETRICS_BLOB_ACCOUNT) 135 | SOURCE_OSMETRICS_FILE_BLOB_KEY=os.getenv("SOURCE_OSMETRICS_FILE_BLOB_KEY",SOURCE_OSMETRICS_FILE_BLOB_KEY) 136 | SOURCE_OSMETRICS_CONTAINER=os.getenv("SOURCE_OSMETRICS_CONTAINER",SOURCE_OSMETRICS_CONTAINER) 137 | SOURCE_OSMETRICS_FILE_TOKEN=os.getenv("SOURCE_OSMETRICS_FILE_TOKEN",SOURCE_OSMETRICS_FILE_TOKEN) 138 | 139 | # ADX CONFIG 140 | APP_AAD_TENANT_ID = os.getenv("APP_AAD_TENANT_ID",APP_AAD_TENANT_ID) 141 | APP_CLIENT_ID = os.getenv("APP_CLIENT_ID",APP_CLIENT_ID) 142 | APP_CLIENT_SECRETS=os.getenv("APP_CLIENT_SECRETS",APP_CLIENT_SECRETS) 143 | DATA_INGESTION_URI =os.getenv("DATA_INGESTION_URI",DATA_INGESTION_URI) 144 | DATABASE = os.getenv("DATABASE",DATABASE) 145 | DESTINATION_TABLE = os.getenv("DESTINATION_TABLE",DESTINATION_TABLE) 146 | 147 | APP_INSIGHT_ID=os.getenv("APP_INSIGHT_ID",APP_INSIGHT_ID) 148 | APP_INSIGHT_INGEST_EVENT_NAME=os.getenv("APP_INSIGHT_INGEST_EVENT_NAME",APP_INSIGHT_INGEST_EVENT_NAME) 149 | APP_INSIGHT_INGEST_RECORDS_COUNT_NAME=os.getenv("APP_INSIGHT_INGEST_RECORDS_COUNT_NAME",APP_INSIGHT_INGEST_RECORDS_COUNT_NAME) 150 | 151 | LOG_MESSAGE_HEADER=os.getenv("LOG_MESSAGE_HEADER",LOG_MESSAGE_HEADER) 152 | 153 | MAIN_INGESTION_STATUS_COSMOS_DOC_TYPE=os.getenv("MAIN_INGESTION_STATUS_COSMOS_DOC_TYPE",MAIN_INGESTION_STATUS_COSMOS_DOC_TYPE) 154 | PROCESS_PROGRAM_NAME=os.getenv("PROCESS_PROGRAM_NAME",PROCESS_PROGRAM_NAME) 155 | 156 | EVENT_SUBJECT_FILTER_REGEX=os.getenv("EVENT_SUBJECT_FILTER_REGEX",EVENT_SUBJECT_FILTER_REGEX) 157 | DESTINATION_TABLE_COLUMN_MAPPING=os.getenv("DESTINATION_TABLE_COLUMN_MAPPING",DESTINATION_TABLE_COLUMN_MAPPING) 158 | IS_FLUSH_IMMEDIATELY=bool(os.getenv("IS_FLUSH_IMMEDIATELY",IS_FLUSH_IMMEDIATELY)) 159 | #logging.info(f'My app setting value:{my_app_setting_value}') 160 | 161 | 162 | def get_container_name(fullfilepath): 163 | frefixword='/containers/' 164 | suffixword='/blobs/' 165 | return fullfilepath[fullfilepath.find(frefixword)+len(frefixword):fullfilepath.find(suffixword)] 166 | 167 | def get_file_path(fullfilepath): 168 | frefixword='/blobs/' 169 | return fullfilepath[fullfilepath.find(frefixword)+len(frefixword):] 170 | 171 | def get_uuids_from_csv(telemetry_block_blob_service,container_name,filepath): 172 | with io.BytesIO() as output_blob: 173 | telemetry_block_blob_service.get_blob_to_stream(container_name, filepath, output_blob) 174 | z = zipfile.ZipFile(output_blob) 175 | foo2 = z.open(z.infolist()[0]) 176 | lines= foo2.readlines() 177 | content="\n".join(line.strip().decode("utf-8") for line in lines) 178 | osmetric_df = pd.read_csv(io.StringIO(content)) 179 | osmetric_df.sort_values('Universal_datetime') 180 | osmetric_df['dataTimestamp']=pd.to_datetime(osmetric_df['Universal_datetime']).values.astype(int) / 10**6 181 | record_num=len( osmetric_df.index) 182 | print(LOG_MESSAGE_HEADER+ " : Total Doc # : "+str(len( osmetric_df.index))+osmetric_df['Universal_datetime'][0]+osmetric_df['Universal_datetime'][record_num-1]) 183 | print(LOG_MESSAGE_HEADER+ " : Total Doc # : "+str(len( osmetric_df.index))+' '+str(osmetric_df['dataTimestamp'][0])+' '+str(osmetric_df['dataTimestamp'][record_num-1])) 184 | 185 | return osmetric_df['vm_uuid'][0],osmetric_df['config_uuid'][0],osmetric_df['deploy_uuid'][0],len(content),osmetric_df['dataTimestamp'][0],osmetric_df['dataTimestamp'][record_num-1],record_num 186 | 187 | #def save_COSMOS_log(vm_uuid,deploy_uuid,config_uuid,file_path,min_datatime,max_datatime, total_records,blob_container_id ,tc): 188 | def save_COSMOS_log(vm_uuid,deploy_uuid,config_uuid,file_path,min_datatime,max_datatime, total_records,ingest_source_id,blob_account,blob_container_name, tc): 189 | 190 | url=COSMOS_URL 191 | key=COSMOS_KEY 192 | client = cosmos_client.CosmosClient(url, {'masterKey': key}) 193 | database_id=COSMOS_DATABASE 194 | container_id=COSMOS_CONTAINER 195 | 196 | database_link = 'dbs/' + database_id 197 | collection_link = database_link + '/colls/' + container_id 198 | 199 | filename=get_filename(file_path) 200 | #doc_id=vm_uuid+'_'+deploy_uuid+'_'+config_uuid+'_'+filename 201 | doc_id=get_doc_id(blob_container_name,file_path) 202 | print (doc_id) 203 | 204 | doc_link = collection_link + '/docs/' + doc_id 205 | options = {} 206 | options['enableCrossPartitionQuery'] = True 207 | options['maxItemCount'] = 5 208 | options['partitionKey'] = vm_uuid 209 | 210 | win_telemetry_info=None 211 | try: 212 | win_telemetry_info = client.ReadItem(doc_link,options) 213 | except: 214 | print("New Process Log Doc") 215 | 216 | if(win_telemetry_info is not None): 217 | print ("Find Existing Process Log Doc ") 218 | else: # New process log 219 | win_telemetry_info={} 220 | win_telemetry_info["id"]=doc_id 221 | win_telemetry_info["process_type"]=PROCESS_PROGRAM_NAME 222 | win_telemetry_info["DOC_TYPE"]=MAIN_INGESTION_STATUS_COSMOS_DOC_TYPE #DOC_TYPE 223 | win_telemetry_info["file_path"]=file_path 224 | win_telemetry_info["ingestions"]=[] 225 | win_telemetry_info["blob_account"]=blob_account 226 | win_telemetry_info["blob_container_name"]=blob_container_name 227 | 228 | ingestion_info={} 229 | ingestion_info["source_id"]=ingest_source_id 230 | ingestion_info["ingest_trigger_time"]=time.time() 231 | ingestion_info["min_datatime"]=min_datatime 232 | ingestion_info["max_datatime"]=max_datatime 233 | ingestion_info["total_records"]=total_records 234 | ingestion_info["status"]='PENDING' 235 | 236 | 237 | win_telemetry_info["ingestions"].append(ingestion_info) 238 | ingestion_info['LATEST_UPDATE_TIMESTAMP']=time.time() 239 | 240 | tc.track_metric(APP_INSIGHT_INGEST_RECORDS_COUNT_NAME, total_records) 241 | tc.flush() 242 | 243 | client.UpsertItem(collection_link,win_telemetry_info,options) 244 | return doc_id 245 | 246 | def ingest_to_ADX(filepath, telemetry_block_blob_service, container_name,blob_account, tc): 247 | ingest_source_id=str(uuid.uuid4()) 248 | #file_size=BlockBlobService.get_blob_properties(telemetry_block_blob_service,container_name,filepath).properties.content_length 249 | #print (filepath+" File Size "+str(file_size)) 250 | 251 | KCSB_INGEST = KustoConnectionStringBuilder.with_aad_device_authentication(DATA_INGESTION_URI) 252 | KCSB_INGEST.authority_id = APP_AAD_TENANT_ID 253 | 254 | vm_uuid,config_uuid,deploy_uuid,file_size, min_datatime, max_datatime, total_records=get_uuids_from_csv(telemetry_block_blob_service,container_name,filepath) 255 | dropByTag= vm_uuid+'_'+config_uuid+'_'+deploy_uuid 256 | 257 | INGESTION_CLIENT = KustoIngestClient(KCSB_INGEST) 258 | INGESTION_PROPERTIES = IngestionProperties(database=DATABASE, table=DESTINATION_TABLE, dataFormat=DataFormat.CSV, mappingReference=DESTINATION_TABLE_COLUMN_MAPPING, additionalProperties={'ignoreFirstRecord': 'true','reportMethod':'QueueAndTable'}, reportLevel=ReportLevel.FailuresAndSuccesses, dropByTags=[dropByTag],flushImmediately=IS_FLUSH_IMMEDIATELY) 259 | 260 | BLOB_PATH = "https://" + SOURCE_OSMETRICS_BLOB_ACCOUNT + ".blob.core.windows.net/" + SOURCE_OSMETRICS_CONTAINER + "/" + filepath + SOURCE_OSMETRICS_FILE_TOKEN 261 | #print (BLOB_PATH,' ',str(file_size)) 262 | BLOB_DESCRIPTOR = BlobDescriptor(BLOB_PATH, file_size, ingest_source_id) # 10 is the raw size of the data in bytes 263 | 264 | INGESTION_CLIENT.ingest_from_blob(BLOB_DESCRIPTOR,ingestion_properties=INGESTION_PROPERTIES) 265 | 266 | 267 | 268 | tc.context.properties["ingest_source_id"]=str(ingest_source_id) 269 | 270 | doc_id=save_COSMOS_log(vm_uuid,deploy_uuid,config_uuid,filepath,min_datatime,max_datatime, total_records,ingest_source_id,blob_account,container_name, tc) 271 | 272 | tc.track_event(APP_INSIGHT_INGEST_EVENT_NAME, { 'FILE_PATH': filepath,'DOC_ID':doc_id,"SOURCE_ID":ingest_source_id }, { 'TOTOAL_RECORDS': total_records, 'FILE_SIZE':file_size,'MIN_DATETIME':min_datatime,'MAX_DATETIME': max_datatime }) 273 | log_msg="{} Done queuing up ingestion with Azure Data Explorer {}, Ingest SourceID {}".format(LOG_MESSAGE_HEADER,filepath,ingest_source_id) 274 | print(log_msg) 275 | tc.track_trace(log_msg) 276 | tc.flush() 277 | 278 | def get_container_name(fullfilepath): 279 | frefixword='/containers/' 280 | suffixword='/blobs/' 281 | return fullfilepath[fullfilepath.find(frefixword)+len(frefixword):fullfilepath.find(suffixword)] 282 | 283 | def get_file_path(fullfilepath): 284 | frefixword='/blobs/' 285 | return fullfilepath[fullfilepath.find(frefixword)+len(frefixword):] 286 | 287 | def get_filename(filepath): 288 | path_secs=filepath.split('/') 289 | filename=path_secs[len(path_secs)-1] 290 | return filename 291 | 292 | 293 | 294 | def get_doc_id(container_id,file_path): 295 | file_path=file_path[len(file_path)-min(len(file_path),200):] # COSMOSDB Can only acceppet doc id length less than 255 characters 296 | return (container_id+'_'+file_path).replace('/','_').replace('\\','_').replace('?','_').replace('#','_') 297 | -------------------------------------------------------------------------------- /functions/JSONConfigADIngestEGFunc/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pandas as pd 4 | import time,datetime 5 | from io import StringIO 6 | from azure.storage.blob import BlockBlobService 7 | from pandas.core.indexes.api import _union_indexes 8 | import json 9 | import codecs 10 | import re 11 | import os,sys 12 | import copy 13 | import io 14 | import zipfile 15 | import re 16 | import uuid 17 | 18 | import azure.functions as func 19 | 20 | from azure.kusto.data.request import KustoClient, KustoConnectionStringBuilder 21 | from azure.kusto.data.exceptions import KustoServiceError 22 | from azure.kusto.data.helpers import dataframe_from_result_table 23 | from azure.kusto.ingest import KustoIngestClient, IngestionProperties, FileDescriptor, BlobDescriptor, DataFormat, ReportLevel, ReportMethod 24 | from azure.kusto.ingest import ( 25 | BlobDescriptor, 26 | IngestionProperties, 27 | DataFormat, 28 | CsvColumnMapping, 29 | JsonColumnMapping, 30 | ReportLevel, 31 | ReportMethod, 32 | ValidationPolicy, 33 | ValidationOptions, 34 | ValidationImplications, 35 | ) 36 | 37 | import azure.cosmos.cosmos_client as cosmos_client 38 | import azure.cosmos.errors as errors 39 | import azure.cosmos.http_constants as http_constant 40 | 41 | from applicationinsights import TelemetryClient 42 | 43 | 44 | # COSMOS CONFIG 45 | COSMOS_URL="https://[COSMOS Server].documents.azure.com:443/" 46 | COSMOS_KEY="" 47 | COSMOS_DATABASE='[COSMOS DATABASE]' 48 | COSMOS_CONTAINER='[]' 49 | 50 | 51 | CONFIG_FILE_BLOB_ACCOUNT="" 52 | CONFIG_FILE_CONTAINER = "" 53 | CONFIG_FILE_TOKEN = "" 54 | CONFIG_FILE_BLOB_KEY="" 55 | 56 | CLEAN_FILE_BLOB_ACCOUNT="" 57 | CLEAN_FILE_CONTAINER = "" 58 | CLEAN_FILE_TOKEN = "" 59 | CLEAN_FILE_BLOB_KEY="" 60 | 61 | 62 | # ADX CONFIG 63 | APP_AAD_TENANT_ID = "[ADX Service Priciple TENANT ID]" 64 | APP_CLIENT_ID = '[ADX Service Priciple Client ID]' 65 | APP_CLIENT_SECRETS='[ADX Service Priciple Secrets]' 66 | 67 | DATA_INGESTION_URI = "https://[KUSTO Ingestion ].kusto.windows.net:443;Federated Security=True;Application Client Id="+APP_CLIENT_ID+";Application Key="+APP_CLIENT_SECRETS 68 | 69 | DATABASE = '' 70 | DESTINATION_TABLE = "" 71 | 72 | APP_INSIGHT_ID="" 73 | APP_INSIGHT_INGEST_EVENT_NAME="EVENTGRID_INGEST_CONFIG_JSON" 74 | APP_INSIGHT_INGEST_RECORDS_COUNT_NAME="INGEST_CONFIG_JSON_SOURCE_FILES_COUNT" 75 | 76 | LOG_MESSAGE_HEADER="[Config-Ingest Blob EventGrid]" 77 | MAIN_INGESTION_STATUS_COSMOS_DOC_TYPE="CONFIG_INGESTION_EVENTGRID" 78 | PROCESS_PROGRAM_NAME="CONFIG_INGESTION_EVENTGRID_V01A" 79 | 80 | EVENT_SUBJECT_FILTER_REGEX="^((?!_telemetry).)*$" 81 | IS_FLUSH_IMMEDIATELY=True #True or False 82 | 83 | vm_uuid="" 84 | deploy_uuid="" 85 | config_uuid="" 86 | 87 | 88 | def main(event: func.EventGridEvent): 89 | result = json.dumps({ 90 | 'id': event.id, 91 | 'data': event.get_json(), 92 | 'topic': event.topic, 93 | 'subject': event.subject, 94 | 'event_type': event.event_type, 95 | }) 96 | 97 | logging.info('{} Python EventGrid trigger processed an event:{}'.format(LOG_MESSAGE_HEADER ,result)) 98 | 99 | get_config_values() 100 | regexp = re.compile(EVENT_SUBJECT_FILTER_REGEX) 101 | 102 | if regexp.search(event.subject): # Check if file path match criteria 103 | tc = TelemetryClient(APP_INSIGHT_ID) 104 | tc.context.application.ver = '1.0' 105 | tc.context.properties["PROCESS_PROGRAM"]=PROCESS_PROGRAM_NAME 106 | tc.context.properties["PROCESS_START"]=time.time() 107 | tc.track_trace('{} STRAT RUN EVENTGRID INGEST TELEMETRY JSON DATA from folder {} '.format(LOG_MESSAGE_HEADER,result)) 108 | tc.flush() 109 | 110 | configfile_block_blob_service = BlockBlobService(account_name=CONFIG_FILE_BLOB_ACCOUNT, account_key=CONFIG_FILE_BLOB_KEY) 111 | cleanfile_block_blob_service = BlockBlobService(account_name=CLEAN_FILE_BLOB_ACCOUNT, account_key=CLEAN_FILE_BLOB_KEY) 112 | filepath=get_file_path(event.subject) 113 | container_name=get_container_name(event.subject) 114 | config_file_name=get_filename(filepath) 115 | logging.info('{} filepath: {}'.format(LOG_MESSAGE_HEADER,filepath)) 116 | logging.info('{} container_name: {}'.format(LOG_MESSAGE_HEADER, container_name)) 117 | 118 | config_file_name_utf8, filesize,vm_uuid,deploy_uuid,config_uuid=generate_UTF8_config_json(configfile_block_blob_service,CONFIG_FILE_CONTAINER,filepath,cleanfile_block_blob_service,CLEAN_FILE_CONTAINER ) 119 | ingest_to_ADX(config_file_name_utf8, cleanfile_block_blob_service, CLEAN_FILE_CONTAINER, CLEAN_FILE_BLOB_ACCOUNT,filesize,tc,vm_uuid,deploy_uuid,config_uuid) 120 | 121 | else: 122 | logging.info("{} Subject : {} does not match regular express {}. Skip process. ".format(LOG_MESSAGE_HEADER,event.subject,EVENT_SUBJECT_FILTER_REGEX)) 123 | 124 | 125 | 126 | def get_config_values(): 127 | global COSMOS_URL,COSMOS_KEY,COSMOS_DATABASE,COSMOS_CONTAINER 128 | global CONFIG_FILE_BLOB_ACCOUNT,CONFIG_FILE_BLOB_KEY,CONFIG_FILE_CONTAINER,CONFIG_FILE_TOKEN,CLEAN_FILE_BLOB_ACCOUNT,CLEAN_FILE_CONTAINER,CLEAN_FILE_TOKEN,CLEAN_FILE_BLOB_KEY 129 | global APP_AAD_TENANT_ID,APP_CLIENT_ID,APP_CLIENT_SECRETS,DATA_INGESTION_URI,DATABASE,DESTINATION_TABLE 130 | global APP_INSIGHT_ID,APP_INSIGHT_INGEST_EVENT_NAME,APP_INSIGHT_INGEST_RECORDS_COUNT_NAME 131 | global LOG_MESSAGE_HEADER,MAIN_INGESTION_STATUS_COSMOS_DOC_TYPE,PROCESS_PROGRAM_NAME,EVENT_SUBJECT_FILTER_REGEX 132 | global IS_FLUSH_IMMEDIATELY 133 | 134 | # COSMOS CONFIG 135 | COSMOS_URL= os.getenv("COSMOS_URL",COSMOS_URL) 136 | COSMOS_KEY= os.getenv("COSMOS_KEY",COSMOS_KEY) 137 | COSMOS_DATABASE=os.getenv("COSMOS_DATABASE",COSMOS_DATABASE) 138 | COSMOS_CONTAINER=os.getenv("COSMOS_CONTAINER",COSMOS_CONTAINER) 139 | 140 | CONFIG_FILE_BLOB_ACCOUNT=os.getenv("CONFIG_FILE_BLOB_ACCOUNT",CONFIG_FILE_BLOB_ACCOUNT) 141 | CONFIG_FILE_BLOB_KEY=os.getenv("CONFIG_FILE_BLOB_KEY",CONFIG_FILE_BLOB_KEY) 142 | CONFIG_FILE_CONTAINER=os.getenv("CONFIG_FILE_CONTAINER",CONFIG_FILE_CONTAINER) 143 | CONFIG_FILE_TOKEN=os.getenv("CONFIG_FILE_TOKEN",CONFIG_FILE_TOKEN) 144 | 145 | CLEAN_FILE_BLOB_ACCOUNT=os.getenv("CLEAN_FILE_BLOB_ACCOUNT",CLEAN_FILE_BLOB_ACCOUNT) 146 | CLEAN_FILE_CONTAINER=os.getenv("CLEAN_FILE_CONTAINER",CLEAN_FILE_CONTAINER) 147 | CLEAN_FILE_TOKEN=os.getenv("CLEAN_FILE_TOKEN",CLEAN_FILE_TOKEN) 148 | CLEAN_FILE_BLOB_KEY=os.getenv("CLEAN_FILE_BLOB_KEY",CLEAN_FILE_BLOB_KEY) 149 | 150 | 151 | # ADX CONFIG 152 | APP_AAD_TENANT_ID = os.getenv("APP_AAD_TENANT_ID",APP_AAD_TENANT_ID) 153 | APP_CLIENT_ID = os.getenv("APP_CLIENT_ID",APP_CLIENT_ID) 154 | APP_CLIENT_SECRETS=os.getenv("APP_CLIENT_SECRETS",APP_CLIENT_SECRETS) 155 | DATA_INGESTION_URI =os.getenv("DATA_INGESTION_URI",DATA_INGESTION_URI) 156 | DATABASE = os.getenv("DATABASE",DATABASE) 157 | DESTINATION_TABLE = os.getenv("DESTINATION_TABLE",DESTINATION_TABLE) 158 | 159 | APP_INSIGHT_ID=os.getenv("APP_INSIGHT_ID",APP_INSIGHT_ID) 160 | APP_INSIGHT_INGEST_EVENT_NAME=os.getenv("APP_INSIGHT_INGEST_EVENT_NAME",APP_INSIGHT_INGEST_EVENT_NAME) 161 | APP_INSIGHT_INGEST_RECORDS_COUNT_NAME=os.getenv("APP_INSIGHT_INGEST_RECORDS_COUNT_NAME",APP_INSIGHT_INGEST_RECORDS_COUNT_NAME) 162 | 163 | LOG_MESSAGE_HEADER=os.getenv("LOG_MESSAGE_HEADER",LOG_MESSAGE_HEADER) 164 | 165 | MAIN_INGESTION_STATUS_COSMOS_DOC_TYPE=os.getenv("MAIN_INGESTION_STATUS_COSMOS_DOC_TYPE",MAIN_INGESTION_STATUS_COSMOS_DOC_TYPE) 166 | PROCESS_PROGRAM_NAME=os.getenv("PROCESS_PROGRAM_NAME",PROCESS_PROGRAM_NAME) 167 | 168 | EVENT_SUBJECT_FILTER_REGEX=os.getenv("EVENT_SUBJECT_FILTER_REGEX",EVENT_SUBJECT_FILTER_REGEX) 169 | IS_FLUSH_IMMEDIATELY=bool(os.getenv("IS_FLUSH_IMMEDIATELY",IS_FLUSH_IMMEDIATELY)) 170 | 171 | #logging.info(f'My app setting value:{my_app_setting_value}') 172 | 173 | 174 | 175 | def ingest_to_ADX(filepath, telemetry_block_blob_service, container_name, blob_account, file_size, tc,vm_uuid,deploy_uuid,config_uuid): 176 | ingest_source_id=str(uuid.uuid4()) 177 | KCSB_INGEST = KustoConnectionStringBuilder.with_aad_device_authentication(DATA_INGESTION_URI) 178 | KCSB_INGEST.authority_id = APP_AAD_TENANT_ID 179 | INGESTION_CLIENT = KustoIngestClient(KCSB_INGEST) 180 | ing_map=[JsonColumnMapping("vm_uuid", "$.vm_uuid", "string"), 181 | JsonColumnMapping("deploy_uuid", "$.deployment_description[0].deploy_uuid", "string"), 182 | JsonColumnMapping("config_uuid", "$.vm_configuration[0].config_uuid", "string"), 183 | JsonColumnMapping("rawdata", "$", "dynamic")] 184 | 185 | INGESTION_PROPERTIES = IngestionProperties(database=DATABASE, table=DESTINATION_TABLE, dataFormat=DataFormat.JSON, ingestionMapping=ing_map, reportLevel=ReportLevel.FailuresAndSuccesses,flushImmediately=IS_FLUSH_IMMEDIATELY) 186 | 187 | print("Database {} Tabele {}".format(DATABASE,DESTINATION_TABLE)) 188 | 189 | BLOB_PATH = "https://" + blob_account + ".blob.core.windows.net/" + container_name + "/" + filepath + CLEAN_FILE_TOKEN 190 | 191 | print (BLOB_PATH,' ',str(file_size), ingest_source_id) 192 | BLOB_DESCRIPTOR = BlobDescriptor(BLOB_PATH, file_size, ingest_source_id) # 10 is the raw size of the data in bytes 193 | INGESTION_CLIENT.ingest_from_blob(BLOB_DESCRIPTOR,ingestion_properties=INGESTION_PROPERTIES) 194 | tc.context.properties["ingest_source_id"]=ingest_source_id 195 | 196 | min_datatime=0 197 | max_datatime=0 198 | total_records=1 199 | 200 | doc_id=save_COSMOS_log(vm_uuid,deploy_uuid,config_uuid,filepath,min_datatime,max_datatime, total_records,ingest_source_id,blob_account,container_name, tc) 201 | 202 | tc.track_event(APP_INSIGHT_INGEST_EVENT_NAME, { 'FILE_PATH': filepath,'DOC_ID':doc_id,"SOURCE_ID":ingest_source_id }, { 'TOTOAL_RECORDS': total_records, 'FILE_SIZE':file_size,'MIN_DATETIME':min_datatime,'MAX_DATETIME': max_datatime }) 203 | log_msg="{} Done queuing up ingestion with Azure Data Explorer {}, Ingest SourceID {}".format(LOG_MESSAGE_HEADER,filepath,ingest_source_id) 204 | print(log_msg) 205 | tc.track_trace(log_msg) 206 | tc.flush() 207 | 208 | 209 | 210 | #def save_COSMOS_log(vm_uuid,deploy_uuid,config_uuid,file_path,min_datatime,max_datatime, total_records,blob_conatiner_name ,tc): 211 | def save_COSMOS_log(vm_uuid,deploy_uuid,config_uuid,file_path,min_datatime,max_datatime, total_records,ingest_source_id,blob_account,blob_container_name, tc): 212 | 213 | url=COSMOS_URL 214 | key=COSMOS_KEY 215 | client = cosmos_client.CosmosClient(url, {'masterKey': key}) 216 | database_id=COSMOS_DATABASE 217 | container_id=COSMOS_CONTAINER 218 | 219 | database_link = 'dbs/' + database_id 220 | collection_link = database_link + '/colls/' + container_id 221 | 222 | #doc_id=vm_uuid+'_'+deploy_uuid+'_'+config_uuid+'_'+filename 223 | doc_id=get_doc_id(blob_container_name,file_path) 224 | print (doc_id) 225 | 226 | doc_link = collection_link + '/docs/' + doc_id 227 | options = {} 228 | options['enableCrossPartitionQuery'] = True 229 | options['maxItemCount'] = 5 230 | options['partitionKey'] = vm_uuid 231 | 232 | win_telemetry_info=None 233 | try: 234 | win_telemetry_info = client.ReadItem(doc_link,options) 235 | except: 236 | print("New Process Log Doc") 237 | 238 | if(win_telemetry_info is not None): 239 | print ("Find Existing Process Log Doc ") 240 | else: # New process log 241 | win_telemetry_info={} 242 | win_telemetry_info["id"]=doc_id 243 | win_telemetry_info["process_type"]=PROCESS_PROGRAM_NAME 244 | win_telemetry_info["DOC_TYPE"]=MAIN_INGESTION_STATUS_COSMOS_DOC_TYPE #DOC_TYPE 245 | win_telemetry_info["file_path"]=file_path 246 | win_telemetry_info["blob_account"]=blob_account 247 | win_telemetry_info["blob_container_name"]=blob_container_name 248 | win_telemetry_info["ingestions"]=[] 249 | 250 | ingestion_info={} 251 | ingestion_info["source_id"]=ingest_source_id 252 | ingestion_info["ingest_trigger_time"]=time.time() 253 | ingestion_info["min_datatime"]=min_datatime 254 | ingestion_info["max_datatime"]=max_datatime 255 | ingestion_info["total_records"]=total_records 256 | ingestion_info["status"]='PENDING' 257 | 258 | 259 | win_telemetry_info["ingestions"].append(ingestion_info) 260 | ingestion_info['LATEST_UPDATE_TIMESTAMP']=time.time() 261 | 262 | tc.track_metric(APP_INSIGHT_INGEST_RECORDS_COUNT_NAME, total_records) 263 | tc.flush() 264 | 265 | client.UpsertItem(collection_link,win_telemetry_info,options) 266 | return doc_id 267 | 268 | 269 | def generate_UTF8_config_json(configfile_block_blob_service,config_file_container,filepath,cleanfile_block_blob_service,clean_file_container ): 270 | 271 | print ("{} Read Config file container {} filepath {}".format(LOG_MESSAGE_HEADER,config_file_container,filepath)) 272 | configstring = configfile_block_blob_service.get_blob_to_text(config_file_container, filepath, encoding='utf-8').content 273 | #print (configstring) 274 | #decoded_data=codecs.decode(configstring.encode(), 'utf-8-sig') 275 | config_obj=json.loads(configstring) 276 | 277 | 278 | print("{} Read Config file size {} ".format(LOG_MESSAGE_HEADER,str(len(config_obj)))) 279 | filesize=len( json.dumps(config_obj)) 280 | clean_file_path=filepath.replace('.json','_ingest.json') 281 | cleanfile_block_blob_service.create_blob_from_text(clean_file_container,clean_file_path, json.dumps(config_obj)) 282 | print ("{} Write config to container {}, file path {}".format(LOG_MESSAGE_HEADER,clean_file_path,clean_file_path)) 283 | return clean_file_path, filesize, vm_uuid,deploy_uuid,config_uuid 284 | 285 | def get_container_name(fullfilepath): 286 | frefixword='/containers/' 287 | suffixword='/blobs/' 288 | return fullfilepath[fullfilepath.find(frefixword)+len(frefixword):fullfilepath.find(suffixword)] 289 | 290 | def get_file_path(fullfilepath): 291 | frefixword='/blobs/' 292 | return fullfilepath[fullfilepath.find(frefixword)+len(frefixword):] 293 | 294 | 295 | def get_filename(filepath): 296 | path_secs=filepath.split('/') 297 | filename=path_secs[len(path_secs)-1] 298 | return filename 299 | 300 | 301 | def get_doc_id(container_id,file_path): 302 | file_path=file_path[len(file_path)-min(len(file_path),200):] # COSMOSDB Can only acceppet doc id length less than 255 characters 303 | return (container_id+'_'+file_path).replace('/','_').replace('\\','_').replace('?','_').replace('#','_') 304 | -------------------------------------------------------------------------------- /functions/CSVFileMergeADXIngestFunc/__init__.py: -------------------------------------------------------------------------------- 1 | import azure.functions as func 2 | 3 | import logging 4 | 5 | import pandas as pd 6 | import time 7 | import datetime 8 | from io import StringIO 9 | from pandas.core.indexes.api import _union_indexes 10 | import json 11 | from io import StringIO 12 | import codecs 13 | import re 14 | import os,sys 15 | import copy 16 | import traceback 17 | 18 | from azure.storage.blob import BlockBlobService 19 | 20 | from azure.kusto.data.request import KustoClient, KustoConnectionStringBuilder 21 | from azure.kusto.data.exceptions import KustoServiceError 22 | from azure.kusto.data.helpers import dataframe_from_result_table 23 | from azure.kusto.ingest import KustoIngestClient, IngestionProperties, FileDescriptor, BlobDescriptor, DataFormat, ReportLevel, ReportMethod 24 | from azure.kusto.ingest import ( 25 | BlobDescriptor, 26 | IngestionProperties, 27 | DataFormat, 28 | CsvColumnMapping, 29 | JsonColumnMapping, 30 | ReportLevel, 31 | ReportMethod, 32 | ValidationPolicy, 33 | ValidationOptions, 34 | ValidationImplications, 35 | ) 36 | 37 | import azure.cosmos.cosmos_client as cosmos_client 38 | import azure.cosmos.errors as errors 39 | import azure.cosmos.http_constants as http_constant 40 | 41 | # COSMOS CONFIG 42 | COSMOS_URL="https://[COSMOS Server].documents.azure.com:443/" 43 | COSMOS_KEY="" 44 | COSMOS_DATABASE='[COSMOS DATABASE]' 45 | COSMOS_CONTAINER='[]' 46 | 47 | 48 | # ADX CONFIG 49 | APP_AAD_TENANT_ID = "[ADX Service Priciple TENANT ID]" 50 | APP_CLIENT_ID = '[ADX Service Priciple Client ID]' 51 | APP_CLIENT_SECRETS='[ADX Service Priciple Secrets]' 52 | DATA_INGESTION_URI = "https://[KUSTO Ingestion ].kusto.windows.net:443;Federated Security=True;Application Client Id="+APP_CLIENT_ID+";Application Key="+APP_CLIENT_SECRETS 53 | 54 | DESTINATION_TABLE = "" 55 | DESTINATION_TABLE_COLUMN_MAPPING = "" 56 | 57 | 58 | CONFIG_FILE_BLOB_ACCOUNT="" 59 | CONFIG_FILE_CONTAINER = "" 60 | CONFIG_FILE_TOKEN = "" 61 | 62 | 63 | SOURCE_CSV_BLOB_ACCOUNT='' 64 | SOURCE_CSV_BLOB_TOKEN='' 65 | SOURCE_CSV_CONTAINER="" 66 | SOURCE_CSV_BLOB_KEY="" 67 | 68 | from applicationinsights import TelemetryClient 69 | 70 | FILE_OUTPUT_FOLDER="" 71 | vm_uuid="" 72 | deploy_uuid="" 73 | config_uuid="" 74 | 75 | AGGREGATION_FILES_NUM=20 # Not Used right now 76 | #MAX_FILESIZE=204800000 77 | MAX_FILESIZE=102400000 78 | #MAX_FILESIZE=51200000 79 | #MAX_FILESIZE=10240000 80 | CONFIG_FILE_BLOB_KEY="" 81 | 82 | CONTAINER = "" 83 | ACCOUNT_NAME = "" 84 | SAS_TOKEN = "" 85 | 86 | # 87 | 88 | def main(req: func.HttpRequest) -> func.HttpResponse: 89 | logging.info('Python HTTP trigger function processed a request.') 90 | 91 | metricspath = req.params.get('metricspath') 92 | forceinsert = req.params.get('forceinsert') 93 | 94 | 95 | if not metricspath: 96 | try: 97 | req_body = req.get_json() 98 | except ValueError: 99 | pass 100 | else: 101 | metricspath = req_body.get('metricspath') 102 | forceinsert = req_body.get('forceinsert') 103 | 104 | if forceinsert is None: 105 | forceinsert='' 106 | 107 | if metricspath: 108 | scode=400 109 | msg='Something Wrong, this is Default Message' 110 | #print (metricspath) 111 | try: 112 | scode, msg=process(metricspath,forceinsert ) 113 | except Exception as e: 114 | error_class = e.__class__.__name__ 115 | detail = e.args[0] 116 | cl, exc, tb = sys.exc_info() 117 | lastCallStack = traceback.extract_tb(tb)[-1] 118 | fileName = lastCallStack[0] 119 | lineNum = lastCallStack[1] 120 | funcName = lastCallStack[2] 121 | errMsg = "File \"{}\", line {}, in {}: [{}] {}".format(fileName, lineNum, funcName, error_class, detail) 122 | 123 | print("Unexpected error:", sys.exc_info()[0]) 124 | traceback.print_exc() 125 | 126 | msg= errMsg+traceback.format_exc() 127 | 128 | tc = TelemetryClient('') 129 | tc.context.application.ver = '1.0' 130 | tc.context.properties["PROCESS_PROGRAM"]="BATCH_CSV_V001a" 131 | tc.context.properties["DATA_FOLDER"]=metricspath 132 | tc.track_trace(msg) 133 | 134 | tc.flush() 135 | scode=500 136 | 137 | return func.HttpResponse(f"Proccessed {metricspath}! "+msg, status_code=scode) 138 | 139 | 140 | else: 141 | return func.HttpResponse( 142 | "Please pass a metricspath on the query string or in the request body", 143 | status_code=400 144 | ) 145 | 146 | def process(filesrootfolder,forceinsert ): 147 | 148 | # Create process id as identify of this process 149 | process_id=time.time() 150 | 151 | tc = TelemetryClient('') 152 | 153 | tc.context.application.ver = '1.0' 154 | tc.context.properties["PROCESS_PROGRAM"]="BATCH_CSV_V001a" 155 | tc.context.properties["PROCESS_START"]=time.time() 156 | tc.context.properties["DATA_FOLDER"]=filesrootfolder 157 | tc.context.properties["PROCESS_ID"]=process_id 158 | 159 | tc.track_trace('STRAT RUN BATHCH INGEST CSV DATA from folder '+filesrootfolder) 160 | tc.track_event('BATHCH_INGEST_CSV_START', { 'PROCESS_ID': process_id,'DATA_FOLDER': filesrootfolder }, { }) 161 | tc.flush() 162 | 163 | tc.flush() 164 | #print (vm_uuid,deploy_uuid,config_uuid) 165 | 166 | 167 | # Prepare COSMOS Link 168 | 169 | url=COSMOS_URL 170 | #key = os.environ['ACCOUNT_KEY'] 171 | key=COSMOS_KEY 172 | client = cosmos_client.CosmosClient(url, {'masterKey': key}) 173 | database_id=COSMOS_DATABASE 174 | container_id=COSMOS_CONTAINER 175 | 176 | database_link = 'dbs/' + database_id 177 | collection_link = database_link + '/colls/' + container_id 178 | 179 | 180 | doc_id=vm_uuid+'_'+config_uuid+'_'+deploy_uuid+'_Metric' 181 | doc_link = collection_link + '/docs/' + doc_id 182 | 183 | options = {} 184 | options['enableCrossPartitionQuery'] = True 185 | options['maxItemCount'] = 5 186 | options['partitionKey'] = vm_uuid 187 | 188 | proc_log_doc=None 189 | try: 190 | proc_log_doc = client.ReadItem(doc_link,options) 191 | except: 192 | print("New Process Metric Doc") 193 | 194 | if(proc_log_doc is not None): 195 | print ("Find Existing Metric Doc ") 196 | 197 | if str(forceinsert).lower()!='true': # Stop Proccess if data is already been proccessed 198 | return 400, doc_id+" is already been processed" 199 | 200 | else: # New process log 201 | proc_log_doc={} 202 | proc_log_doc["PROCESSES"]=[] 203 | proc_log_doc["DOC_TYPE"]="PROCESS_METRIC" 204 | proc_log_doc["PROCESS_PROGRAM"]="BATCH_METRIC_CSV_V001a" 205 | proc_log_doc['id']=doc_id 206 | 207 | tc.track_event('BATHCH_INGEST_METRIC_CSV', { 'PROCESS_ID': process_id }, { 'DATA_FOLDER': filesrootfolder }) 208 | #+'_'+config_uuid+'_'+deploy_uuid , { 'DATA_FOLDER': telemetriespath } 209 | tc.flush() 210 | proc_log_this={} 211 | proc_log_this["PROCESS_PROGRAM"]="BATCH_METRIC_CSV_V001a" 212 | proc_log_this["PROCESS_START"]=time.time() 213 | proc_log_this["DATA_FOLDER"]=filesrootfolder 214 | proc_log_this['id']=vm_uuid+'_'+config_uuid+'_'+deploy_uuid+'_'+str(process_id) 215 | 216 | error_files,merged_files, source_files= merge_rename_core_columns_CSV(vm_uuid,deploy_uuid,config_uuid,'defualt_metrics_csv_001A',0,SOURCE_CSV_CONTAINER,filesrootfolder,FILE_OUTPUT_FOLDER,process_id) 217 | 218 | # ToDo ... 219 | proc_log_this["PROCESS_ID"]=process_id 220 | proc_log_this["ERROR_SOURCE_FILES_COUNT"]=len(error_files) 221 | proc_log_this["SOURCE_FILES_COUNT"]=len(source_files) 222 | 223 | tc.track_metric('BATHCH_INGEST_CSV_ERROR_SOURCE_FILES_COUNT', len(error_files)) 224 | tc.track_metric('BATHCH_INGEST_CSV_ERROR_SOURCE_SOURCE_FILES_COUNT', len(source_files)) 225 | tc.flush() 226 | 227 | # print(str(len(error_files)),' ',str(len(merged_files))) 228 | 229 | proc_log_this["PROCESS_END"]=time.time() 230 | proc_log_this["STATUS"]="OK" 231 | 232 | proc_log_this["STATUS_MESSAGE"]=("It takes %s seconds to ingest CSV file from Blob Storage") % (proc_log_this["PROCESS_END"] - proc_log_this["PROCESS_START"]) 233 | 234 | proc_log_doc["PROCESSES"].append(proc_log_this) 235 | proc_log_doc['LATEST_UPDATE_TIMESTAMP']=time.time() 236 | 237 | # Update Process Log 238 | client.UpsertItem(collection_link,proc_log_doc,options) 239 | 240 | tc.track_trace('END RUN BATHCH INGEST METRIC CSV DATA from folder '+filesrootfolder) 241 | 242 | tc.track_event('BATHCH_INGEST_METRIC_CSV_END', { 'PROCESS_ID': process_id,'DATA_FOLDER': filesrootfolder }, { 'DEFECT_FILES_COUNT':len(error_files),'MERGED_FILES_COUNT':len(merged_files),'SOURCE_FILES_COUNT':len(source_files)}) 243 | tc.flush() 244 | 245 | 246 | 247 | def get_uuids(filesrootfolder): 248 | 249 | #if filesrootfolder.endswith('/'): 250 | # filesrootfolder=filesrootfolder[0:len(filesrootfolder)-1] 251 | if not filesrootfolder.endswith('/'): 252 | filesrootfolder=filesrootfolder+'/' 253 | 254 | config_file_name="" 255 | config_file_path=filesrootfolder 256 | config_file_name_end=config_file_path[len(config_file_path)-36:len(config_file_path)-1] 257 | print(config_file_name_end) 258 | 259 | #configfile_block_blob_service = BlockBlobService(account_name=CONFIG_FILE_BLOB_ACCOUNT, sas_token=CONFIG_FILE_TOKEN) 260 | configfile_block_blob_service = BlockBlobService(account_name=CONFIG_FILE_BLOB_ACCOUNT, account_key=CONFIG_FILE_BLOB_KEY) 261 | blobs=configfile_block_blob_service.list_blobs(CONFIG_FILE_CONTAINER, prefix=config_file_path, delimiter="/") 262 | #print (len(list(blobs))) 263 | blobpaths=[] 264 | for blob in blobs: 265 | #print(blob.name) 266 | if(blob.name.lower().endswith((config_file_name_end+'.json').lower())): 267 | config_file_name=blob.name 268 | print(blob.name) 269 | 270 | if(len(config_file_name)>10): 271 | #print (config_file_name) 272 | configstring = configfile_block_blob_service.get_blob_to_text(CONFIG_FILE_CONTAINER, config_file_name, encoding='utf_16_le').content 273 | 274 | #print (configstring) 275 | 276 | decoded_data=codecs.decode(configstring.encode(), 'utf-8-sig') 277 | config_obj=json.loads(decoded_data) 278 | 279 | 280 | print("done") 281 | return vm_uuid,deploy_uuid,config_uuid 282 | 283 | def insert_json_cosmos(jsondoc): 284 | url=COSMOS_URL 285 | #key = os.environ['ACCOUNT_KEY'] 286 | key=COSMOS_KEY 287 | client = cosmos_client.CosmosClient(url, {'masterKey': key}) 288 | database_id=COSMOS_DATABASE 289 | container_id=COSMOS_CONTAINER 290 | client.CreateItem("dbs/" + database_id + "/colls/" + container_id, jsondoc) 291 | 292 | def merge_rename_core_columns_CSV(vm_uuid,deploy_uuid,config_uuid,schema_ver,inject_ver,container_name,filesrootfolder,fileoutputfolder, process_id): 293 | #block_blob_service = BlockBlobService(account_name=SOURCE_CSV_BLOB_ACCOUNT, sas_token=SOURCE_CSV_BLOB_TOKEN) 294 | block_blob_service = BlockBlobService(account_name=SOURCE_CSV_BLOB_ACCOUNT, account_key=SOURCE_CSV_BLOB_KEY) 295 | tc = TelemetryClient('') 296 | print("Start merge CSV ", vm_uuid,' ',deploy_uuid,' ',config_uuid) 297 | 298 | blobs = [] 299 | marker = None 300 | while True: 301 | batch = block_blob_service.list_blobs(container_name, prefix=filesrootfolder) 302 | blobs.extend(batch) 303 | if not batch.next_marker: 304 | break 305 | marker = batch.next_marker 306 | i=0 307 | blobpaths=[] 308 | for blob in blobs: 309 | blobpaths.append(blob.name) 310 | 311 | matchers = ['.csv'] 312 | matching = [s for s in blobpaths if any(xs in s for xs in matchers)] 313 | 314 | 315 | mergelog={} 316 | mergelog["vm_uuid"]=vm_uuid 317 | 318 | mergelog["process_type"]="MERGE_METRIC_CSV" 319 | mergelog["DOC_TYPE"]="MERGE_METRIC_FILES_LOG" 320 | mergelog["file_folder"]=filesrootfolder 321 | mergelog["process_time"]=time.time() 322 | mergelog["files"]=[] 323 | mergelog["defect_files"]=[] 324 | 325 | a_mergelog=copy.deepcopy(mergelog) 326 | 327 | 328 | dfagg = pd.DataFrame(columns=[]) 329 | 330 | mixagg=AGGREGATION_FILES_NUM 331 | aggcount=0 332 | aggcount_total=0 333 | aggoutcount=0 334 | aggsize=0 335 | 336 | error_files=[] 337 | merged_files=[] 338 | totoal_rows=0 339 | alldfs=[] 340 | outfilenamebase=fileoutputfolder+filesrootfolder+"_aggr_" 341 | t1=time.time() 342 | #print (outfilenamebase) 343 | source_col=[''] 344 | target_col=[''] 345 | 346 | tc.track_trace('Prepare to process '+str(len(matching))+' Metric CSV files ') 347 | tc.flush() 348 | 349 | for fname in matching: 350 | #print(aggcount) 351 | 352 | head, tail = os.path.split(fname) 353 | 354 | aggcount+=1 355 | aggcount_total+=1 356 | 357 | blobstring = block_blob_service.get_blob_to_text(container_name, fname).content 358 | aggsize+=len(blobstring) 359 | 360 | #print('Prepare to merge '+str(aggcount_total)+' / '+str(len(matching)) +' Memeory '+str(aggsize)+' File Name: '+tail) 361 | #tc.track_trace('Prepare to merge '+tail) 362 | #tc.flush() 363 | 364 | 365 | try: # Rread CSV And Try Processing 366 | 367 | dfone = pd.read_csv(StringIO(blobstring)) 368 | 369 | dfAll_cols=dfone.columns 370 | #colname0=dfAll_cols 371 | dfAll_newcols=[] 372 | 373 | pc_name=re.search(r'(\\{2}.*\\)(.*\\)', dfAll_cols[1]).group(1) 374 | 375 | for col in dfAll_cols: 376 | dfAll_newcols.append(col.replace(pc_name, '').replace('`', '').replace('\\','').replace(' ','').replace('/','').replace('.','').replace('-','').replace('%','').replace('(','').replace(')','')) 377 | 378 | dfAll_newcols[0]="Universal_datetime" 379 | 380 | # Rename all columns 381 | dfone.columns=dfAll_newcols 382 | 383 | alldfs.append(dfone) 384 | a_mergelog['files'].append(tail) 385 | 386 | #if (aggcount>=mixagg) or (aggcount_total==len(matching)): 387 | if (aggsize>MAX_FILESIZE) or (aggcount_total==len(matching)): 388 | if(aggcount_total==len(matching)): 389 | print("Processing Final File") 390 | tc.track_trace('Processing Final File') 391 | tc.flush() 392 | 393 | alldfs.append(pd.DataFrame(columns=source_col)) 394 | dfagg=pd.concat(alldfs, ignore_index=True) 395 | dfagg_out=dfagg[source_col] 396 | dfagg_out.columns=target_col 397 | dfagg_out['schema_ver']=schema_ver 398 | dfagg_out['inject_ver']=inject_ver 399 | output = dfagg_out.to_csv (index=False, encoding = "utf-8") 400 | outfile=outfilenamebase+str(aggoutcount)+".csv" 401 | block_blob_service.create_blob_from_text(container_name,outfile, output) 402 | print ("Output aggregated file to "+container_name, outfile+" Data Shape "+ str(dfagg.shape)+' uuid: '+str(vm_uuid)+str(deploy_uuid)+ str(config_uuid)) 403 | totoal_rows+=dfagg_out.shape[0] 404 | 405 | merged_files.append(outfile) 406 | 407 | 408 | a_mergelog['output_file']=outfile 409 | a_mergelog['merged_files_num']=len(a_mergelog['files']) 410 | a_mergelog['defect_files_num']=len(a_mergelog['defect_files']) 411 | 412 | # Insert Process Log to COSMOS DB 413 | insert_json_cosmos(a_mergelog) 414 | a_mergelog=copy.deepcopy(mergelog) 415 | t2=time.time() 416 | 417 | print(("It takes %s seconds to merge "+str(aggcount)+" CSV Metrics") % (t2 - t1)) 418 | aggoutcount+=1 419 | aggcount=0 420 | aggsize=0 421 | alldfs=[] 422 | t1=time.time() 423 | file_size=BlockBlobService.get_blob_properties(block_blob_service,container_name,outfile).properties.content_length 424 | print (outfile+" File Size "+str(file_size)) 425 | 426 | # Ingest to AXX 427 | ingest_to_ADX(outfile,file_size) 428 | except Exception as e : 429 | print ('Error While process '+fname) 430 | error_class = e.__class__.__name__ 431 | detail = e.args[0] 432 | cl, exc, tb = sys.exc_info() 433 | lastCallStack = traceback.extract_tb(tb)[-1] 434 | fileName = lastCallStack[0] 435 | lineNum = lastCallStack[1] 436 | funcName = lastCallStack[2] 437 | errMsg = "File \"{}\", line {}, in {}: [{}] {}".format(fileName, lineNum, funcName, error_class, detail) 438 | 439 | print("Unexpected error:", sys.exc_info()[0]) 440 | traceback.print_exc() 441 | 442 | msg= errMsg+traceback.format_exc() 443 | 444 | tc = TelemetryClient('') 445 | tc.context.application.ver = '1.0' 446 | tc.context.properties["PROCESS_PROGRAM"]="BATCH_METRIC_CSV_V001a" 447 | tc.context.properties["DATA_FOLDER"]=metricspath 448 | tc.track_trace(msg) 449 | 450 | tc.flush() 451 | # print("Unexpected error:", sys.exc_info()[0]) 452 | a_mergelog["defect_files"].append(tail) 453 | error_files.append(fname) # Add No-Well Formed JSON to error file 454 | print('Total Rows '+str(totoal_rows)) 455 | 456 | tc.track_trace('Proccessed Rows: '+str(totoal_rows)) 457 | tc.track_metric('BATHCH_INGEST_METRIC_CSV_TOTAL_ROWS', str(totoal_rows)) 458 | tc.flush() 459 | return error_files,merged_files,matching 460 | 461 | 462 | def ingest_to_ADX(filepath, filesize): 463 | KCSB_INGEST = KustoConnectionStringBuilder.with_aad_device_authentication(DATA_INGESTION_URI) 464 | KCSB_INGEST.authority_id = AAD_TENANT_ID 465 | 466 | KCSB_ENGINE = KustoConnectionStringBuilder.with_aad_device_authentication(URI) 467 | KCSB_ENGINE.authority_id = AAD_TENANT_ID 468 | 469 | INGESTION_CLIENT = KustoIngestClient(KCSB_INGEST) 470 | INGESTION_PROPERTIES = IngestionProperties(database=DATABASE, table=DESTINATION_TABLE, dataFormat=DataFormat.CSV, mappingReference=DESTINATION_TABLE_COLUMN_MAPPING, additionalProperties={'ignoreFirstRecord': 'true'}, reportLevel=ReportLevel.FailuresAndSuccesses) 471 | BLOB_PATH = "https://" + SOURCE_CSV_BLOB_ACCOUNT + ".blob.core.windows.net/" + SOURCE_CSV_CONTAINER + "/" + filepath + SOURCE_CSV_BLOB_TOKEN 472 | 473 | BLOB_DESCRIPTOR = BlobDescriptor(BLOB_PATH, filesize) # 10 is the raw size of the data in bytes 474 | INGESTION_CLIENT.ingest_from_blob(BLOB_DESCRIPTOR,ingestion_properties=INGESTION_PROPERTIES) 475 | 476 | print('Done queuing up ingestion with Azure Data Explorer '+filepath) -------------------------------------------------------------------------------- /functions/ADXIngestMonitorFunc/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pandas as pd 4 | import time,datetime 5 | from io import StringIO 6 | from azure.storage.blob import BlockBlobService 7 | from pandas.core.indexes.api import _union_indexes 8 | import json 9 | import codecs 10 | import re 11 | import os,sys 12 | import copy 13 | import io 14 | import zipfile 15 | import uuid 16 | import traceback 17 | import azure.functions as func 18 | 19 | from azure.kusto.data.request import KustoClient, KustoConnectionStringBuilder 20 | from azure.kusto.data.exceptions import KustoServiceError 21 | from azure.kusto.data.helpers import dataframe_from_result_table 22 | from azure.kusto.ingest import KustoIngestClient, IngestionProperties, FileDescriptor, BlobDescriptor, DataFormat, ReportLevel, ReportMethod 23 | from azure.kusto.ingest import ( 24 | BlobDescriptor, 25 | IngestionProperties, 26 | DataFormat, 27 | CsvColumnMapping, 28 | JsonColumnMapping, 29 | ReportLevel, 30 | ReportMethod, 31 | ValidationPolicy, 32 | ValidationOptions, 33 | ValidationImplications, 34 | ) 35 | 36 | from azure.kusto.ingest._resource_manager import _ResourceUri 37 | from azure.kusto.ingest.status import KustoIngestStatusQueues, SuccessMessage, FailureMessage 38 | 39 | import azure.cosmos.cosmos_client as cosmos_client 40 | import azure.cosmos.errors as errors 41 | import azure.cosmos.http_constants as http_constant 42 | 43 | from applicationinsights import TelemetryClient 44 | 45 | from msrest.authentication import TopicCredentials 46 | from azure.eventgrid import EventGridClient 47 | 48 | 49 | # COSMOS CONFIG 50 | COSMOS_URL="https://[COSMOS Server].documents.azure.com:443/" 51 | COSMOS_KEY="" 52 | COSMOS_DATABASE='[COSMOS DATABASE]' 53 | COSMOS_CONTAINER='[]' 54 | 55 | 56 | PROCESSED_TELEMETRY_FOLDER='telemetry_processed' 57 | 58 | # ADX CONFIG 59 | APP_AAD_TENANT_ID = "[ADX Service Priciple TENANT ID]" 60 | APP_CLIENT_ID = '[ADX Service Priciple Client ID]' 61 | APP_CLIENT_SECRETS='[ADX Service Priciple Secrets]' 62 | DATA_INGESTION_URI = "https://[KUSTO Ingestion ].kusto.windows.net:443;Federated Security=True;Application Client Id="+APP_CLIENT_ID+";Application Key="+APP_CLIENT_SECRETS 63 | 64 | SUCCESS_STATUS="SUCCESS" 65 | FAILURE_STATUS="FAILURE" 66 | 67 | APP_INSIGHT_ID="" 68 | APP_INSIGHT_MAIN_ERROR_EVENT_NAME="MONITOR_ADX_ERROR" 69 | APP_INSIGHT_INGEST_SUCCESS_COUNT_NAME="INGEST_SUCCESS_COUNT" 70 | APP_INSIGHT_INGEST_FAILURE_COUNT_NAME="INGEST_FAILURE_COUNT" 71 | APP_INSIGHT_INGEST_SUCCESS_EVENT_NAME="INGEST_SUCCESS" 72 | APP_INSIGHT_INGEST_FAILURE_EVENT_NAME="INGEST_FAILURE" 73 | 74 | LOG_MESSAGE_HEADER="[ADX-MONITOR]" 75 | PROCESS_PROGRAM_NAME="MONITOR_ADX_V001a" 76 | 77 | EVENT_GRID_ENDPOINT="[event grid endpoint].eventgrid.azure.net" 78 | EVENT_GRID_KEY="" 79 | 80 | COSMOS_CLIENT = cosmos_client.CosmosClient(COSMOS_URL, {'masterKey': COSMOS_KEY}) 81 | vm_uuid="" 82 | deploy_uuid="" 83 | config_uuid="" 84 | 85 | def get_file_path(IngestionSourcePath): 86 | blob_basic_urn=".blob.core.windows.net/" 87 | temp_path=IngestionSourcePath[IngestionSourcePath.find(blob_basic_urn)+len(blob_basic_urn):] 88 | #print (temp_path) 89 | file_path=temp_path[temp_path.find('/')+1:] 90 | #print (file_path) 91 | return file_path 92 | 93 | 94 | def get_container_name(IngestionSourcePath): 95 | blob_basic_urn=".blob.core.windows.net/" 96 | temp_path=IngestionSourcePath[IngestionSourcePath.find(blob_basic_urn)+len(blob_basic_urn):] 97 | sec_path=temp_path.split('/') 98 | return sec_path[0] 99 | 100 | def update_COSMOS_status(client,file_path,datetime, status, message,vm_uuid,source_id ,blob_conatiner_name,tc , doc_order,run_id): 101 | 102 | database_id=COSMOS_DATABASE 103 | container_id=COSMOS_CONTAINER 104 | 105 | doc_id=get_doc_id(blob_conatiner_name,file_path) 106 | database_link = 'dbs/' + database_id 107 | collection_link = database_link + '/colls/' + container_id 108 | 109 | doc_link = collection_link + '/docs/' + doc_id 110 | 111 | print('{} doc_link : {}'.format(LOG_MESSAGE_HEADER, doc_link)) 112 | 113 | options = {} 114 | options['enableCrossPartitionQuery'] = True 115 | options['maxItemCount'] = 5 116 | options['partitionKey'] = vm_uuid 117 | 118 | win_telemetry_info=None 119 | try: 120 | win_telemetry_info = client.ReadItem(doc_link,options) 121 | #print(win_telemetry_info) 122 | 123 | except: 124 | tc.track_trace('{} Failed TO FIND DOC {} Ingest Log in COSMOSDB, DOC_ID: {}'.format(LOG_MESSAGE_HEADER,doc_order,doc_id)) 125 | tc.track_event(APP_INSIGHT_MAIN_ERROR_EVENT_NAME, {'MESSAGE': 'Failed TO FIND DOC Ingest Log in COSMOSDB','DOC_ID': doc_id },{ }) 126 | 127 | tc.flush() 128 | print ('{} Failed TO FIND DOC Ingest Log in COSMOSDB, DOC_ID: {}'.format(LOG_MESSAGE_HEADER,doc_id)) 129 | return 130 | 131 | if(win_telemetry_info is not None): 132 | ingest_updated=False 133 | for ingestion_info in win_telemetry_info["ingestions"]: 134 | try: 135 | if ingestion_info["status"]=='PENDING': 136 | if ingestion_info["source_id"]==source_id: 137 | tc.track_trace('{} Found Pending status, DOC_ID: {}, DOC_SOURCE_ID: {}, '.format(LOG_MESSAGE_HEADER,doc_id, source_id)) 138 | ingestion_info["status"]=status 139 | ingestion_info["ingest_finish_time"]=datetime 140 | ingestion_info["message"]=message 141 | 142 | if status==SUCCESS_STATUS: 143 | tc.track_trace('{} SUCCESS TO INGEST TO ADX, RUN_ID: {}, DOC_ORDER [{}], DOC_ID: {}, SOURCE_ID: {}'.format(LOG_MESSAGE_HEADER,run_id, doc_order, doc_id, source_id)) 144 | tc.track_event(APP_INSIGHT_INGEST_SUCCESS_EVENT_NAME, {'MESSAGE': 'SUCCESS TO Ingest ADX','DOC_ID': doc_id, 'SOURCE_ID':source_id },{"Total_RECORDS":ingestion_info["total_records"]}) 145 | 146 | tc.track_metric(APP_INSIGHT_INGEST_SUCCESS_COUNT_NAME, ingestion_info["total_records"]) 147 | tc.flush() 148 | 149 | publish_eventgrid(EVENT_GRID_ENDPOINT,EVENT_GRID_KEY,win_telemetry_info['vm_uuid'], win_telemetry_info['config_uuid'], win_telemetry_info['deploy_uuid'],file_path,ingestion_info['min_datatime'],ingestion_info['max_datatime'], ingestion_info['total_records'], win_telemetry_info['blob_account'], win_telemetry_info['blob_container_name'] ) 150 | 151 | tc.track_trace('{} Publish to Eventgrid, DOC_Order: {} DOC_ID: {}, SOURCE_ID: {}'.format(LOG_MESSAGE_HEADER,doc_order, doc_id, source_id)) 152 | tc.flush() 153 | 154 | elif status==FAILURE_STATUS: 155 | tc.track_trace('{} FAILED TO INGEST TO ADX,DOC_ORDER {}, DOC_ID: {}, SOURCE_ID {} '.format(LOG_MESSAGE_HEADER,doc_order, doc_id,source_id)) 156 | tc.track_metric(APP_INSIGHT_INGEST_FAILURE_COUNT_NAME, ingestion_info["total_records"]) 157 | tc.track_event(APP_INSIGHT_INGEST_FAILURE_EVENT_NAME, {'MESSAGE': 'Failed TO Ingest ADX','DOC_ID': doc_id , 'SOURCE_ID':source_id},{"Total_RECORDS":ingestion_info["total_records"]}) 158 | tc.flush() 159 | 160 | client.UpsertItem(collection_link,win_telemetry_info,options) 161 | ingest_updated=True 162 | break 163 | except Exception: 164 | print(traceback.format_exc()) 165 | print(sys.exc_info()[2]) 166 | 167 | if(not ingest_updated): 168 | tc.track_trace('{} Failed TO FIND status=PENDING Ingest record with COSMOSDB DOC_ID: {}, SOURCE_ID:{}'.format(LOG_MESSAGE_HEADER,doc_id,source_id )) 169 | tc.track_event(APP_INSIGHT_MAIN_ERROR_EVENT_NAME, {'MESSAGE': 'Failed TO FIND status=PENDING Ingest record with COSMOSDB','DOC_ID': doc_id , 'SOURCE_ID':source_id},{ }) 170 | tc.flush() 171 | print('{} Failed TO FIND status=PENDING Ingest record with COSMOSDB DOC_ID: {}'.format(LOG_MESSAGE_HEADER,doc_id)) 172 | 173 | def move_processed_file(blockblobservice, source_conatainer, source_filepath, target_container, target_filepath, tc): 174 | try: 175 | print("{} Start Move Processed Files".format(LOG_MESSAGE_HEADER)) 176 | blob_url = blockblobservice.make_blob_url(source_conatainer, source_filepath) 177 | print("{} Blob URL : {}".format(LOG_MESSAGE_HEADER, blob_url)) 178 | # blob_url:https://demostorage.blob.core.windows.net/image-container/pretty.jpg 179 | 180 | blockblobservice.copy_blob( target_container, target_filepath, blob_url) 181 | #for move the file use this line 182 | blockblobservice.delete_blob(source_conatainer, source_filepath) 183 | except Exception: 184 | print(traceback.format_exc()) 185 | print(sys.exc_info()[2]) 186 | 187 | errormsg="{} Failed TO Clean Ingested File. Source Container: {}, Source File path: {}. Target Container {}, Target File path {}, traceback {}, execute info {}".format(LOG_MESSAGE_HEADER,source_conatainer, source_filepath,target_container, target_filepath,str(traceback.format_exc()),str(sys.exc_info()[2])) 188 | print (errormsg) 189 | 190 | tc.track_trace(errormsg) 191 | tc.track_event(APP_INSIGHT_MAIN_ERROR_EVENT_NAME, {'MESSAGE': errormsg,'Doc Path': source_filepath },{ }) 192 | tc.flush() 193 | 194 | def get_filename(filepath): 195 | path_secs=filepath.split('/') 196 | filename=path_secs[len(path_secs)-1] 197 | return filename 198 | 199 | 200 | def get_doc_id(container_id,file_path): 201 | file_path=file_path[len(file_path)-min(len(file_path),200):] # COSMOSDB Can only acceppet doc id length less than 255 characters 202 | return (container_id+'_'+file_path).replace('/','_').replace('\\','_').replace('?','_').replace('#','_') 203 | 204 | 205 | def get_vm_uuid_from_filename(file_path): 206 | uuid_rex="([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}){1}" 207 | pattern_regex = re.compile(uuid_rex) 208 | result = pattern_regex.findall(file_path) 209 | if len(result)>0: 210 | return result[0] 211 | else: 212 | print ("{} Couldn't find vm_uuid from file path {}".format("", file_path)) 213 | return "COULD_NOT_FIND_VM_UUID" 214 | 215 | 216 | def update_ADX_ingest_status(tc): 217 | 218 | KCSB_INGEST = KustoConnectionStringBuilder.with_aad_device_authentication(DATA_INGESTION_URI) 219 | KCSB_INGEST.authority_id = APP_AAD_TENANT_ID 220 | INGESTION_CLIENT = KustoIngestClient(KCSB_INGEST) 221 | qs = KustoIngestStatusQueues(INGESTION_CLIENT) 222 | 223 | run_id=(str(uuid.uuid4()))[31:].upper() 224 | MAX_BACKOFF = 8 225 | backoff = 1 226 | 227 | total_queue_success_messages=0 228 | while True: 229 | ################### NOTICE #################### 230 | # in order to get success status updates, 231 | # make sure ingestion properties set the 232 | # reportLevel=ReportLevel.FailuresAndSuccesses. 233 | if qs.success.is_empty() and qs.failure.is_empty(): 234 | time.sleep(backoff) 235 | 236 | if backoff==1 and total_queue_success_messages!=0: 237 | print ("{} RUN_ID:{} Processed {} message in this batch ".format(LOG_MESSAGE_HEADER,run_id, total_queue_success_messages)) 238 | 239 | backoff = min(backoff * 2, MAX_BACKOFF) 240 | if(backoff0): 256 | tc.track_trace("{} Get {} success ingest messages ".format(LOG_MESSAGE_HEADER,str(len(success_messages)))) 257 | total_success=len(success_messages) 258 | if failure_messages is not None: 259 | if (len(failure_messages)>0): 260 | tc.track_trace("{} Get {} failure ingest messages ".format(LOG_MESSAGE_HEADER,str(len(failure_messages)))) 261 | total_failure=len(failure_messages) 262 | tc.flush() 263 | total_queue_success_messages+=len(success_messages) 264 | count_success=0 265 | count_faulure=0 266 | for smsg in success_messages: 267 | file_path=get_file_path(smsg.IngestionSourcePath) 268 | container_name=get_container_name(smsg.IngestionSourcePath) 269 | count_success+=1 270 | log_msg="{} SUCCESS TO INGEST TO ADX <{}> -[{}/{}/{}] , Time: {}, vm_uuid: {}, source_id:{}, file path: {}".format(LOG_MESSAGE_HEADER,run_id,str(count_success), str(total_success), str(total_queue_success_messages),smsg.SucceededOn,get_vm_uuid_from_filename(file_path),smsg.IngestionSourceId, file_path) 271 | tc.track_trace(log_msg) 272 | tc.track_event(APP_INSIGHT_INGEST_SUCCESS_EVENT_NAME, {'MESSAGE': 'SUCCESS TO Ingest ADX','file_path': file_path ,'source_id':smsg.IngestionSourceId },{}) 273 | tc.flush() 274 | update_COSMOS_status(COSMOS_CLIENT, file_path,smsg.SucceededOn,SUCCESS_STATUS,str(smsg),get_vm_uuid_from_filename(file_path),smsg.IngestionSourceId,container_name ,tc,count_success ,run_id ) 275 | 276 | telemetry_block_blob_service = BlockBlobService(account_name=SOURCE_TELEMETRY_BLOB_ACCOUNT, account_key=SOURCE_TELEMETRY_FILE_BLOB_KEY) 277 | 278 | target_file_path='' 279 | if (PROCESSED_TELEMETRY_FOLDER.endswith('/')): 280 | target_file_path=PROCESSED_TELEMETRY_FOLDER+file_path 281 | else : 282 | target_file_path=PROCESSED_TELEMETRY_FOLDER+'/'+file_path 283 | 284 | move_processed_file(telemetry_block_blob_service,container_name,file_path,container_name,target_file_path,tc) 285 | tc.track_trace('{} DONE ADX INGESTION PROCESS <{}> -[{}/{}/{}], File Moved to processed folder {} , vm_uuid: {}, file path: {}'.format(LOG_MESSAGE_HEADER,run_id,str(count_success), str(total_success), str(total_queue_success_messages),target_file_path,get_vm_uuid_from_filename(file_path),file_path)) 286 | tc.track_event(APP_INSIGHT_INGEST_SUCCESS_EVENT_NAME, {'MESSAGE': 'DONE ADX INGESTION PROCESS','moved_file_path':target_file_path,'source_file_path':file_path },{}) 287 | tc.flush() 288 | #smsgjson=json.loads(smsg) 289 | #print (smsgjson['IngestionSourcePath']) 290 | #print (smsgjson['SucceededOn']) 291 | print ("{} IngestionSourcePath: {}".format(LOG_MESSAGE_HEADER,smsg.IngestionSourcePath)) 292 | print (smsg.SucceededOn) 293 | for fmsg in failure_messages: 294 | container_name=get_container_name(fmsg.IngestionSourcePath) 295 | file_path=get_file_path(fmsg.IngestionSourcePath) 296 | count_faulure+=1 297 | log_msg="{} FAILED TO INGEST TO ADX <{}> -[{}/{}] , Time: {}, vm_uuid: {}, source_id:{}, container:{}, file path: {}, message: {}".format(LOG_MESSAGE_HEADER,run_id, str(count_faulure), str(total_failure),fmsg.FailedOn,get_vm_uuid_from_filename(file_path),fmsg.IngestionSourceId ,container_name,file_path,str(fmsg)) 298 | tc.track_trace(log_msg) 299 | tc.track_event(APP_INSIGHT_INGEST_FAILURE_EVENT_NAME, {'MESSAGE': 'FAILED TO Ingest ADX','file_path': file_path, 'source_id':fmsg.IngestionSourceId },{}) 300 | tc.flush() 301 | update_COSMOS_status(COSMOS_CLIENT,file_path,fmsg.FailedOn,FAILURE_STATUS,str(fmsg),get_vm_uuid_from_filename(file_path),fmsg.IngestionSourceId,container_name,tc , count_faulure,run_id) 302 | 303 | 304 | 305 | def publish_eventgrid(EVENT_GRID_ENDPOINT,EVENT_GRID_KEY,vm_uuid, config_uuid, deploy_uuid,file_path,min_unixtime,max_unixtime, total_records,storage_account,container_name ): 306 | 307 | credentials = TopicCredentials(EVENT_GRID_KEY ) 308 | event_grid_client = EventGridClient(credentials) 309 | event_time=datetime.datetime.fromtimestamp(time.time()) 310 | events_content=[{ 311 | 'id' : str(uuid.uuid1()), 312 | 'subject' : "FINISH INGEST TO ADX", 313 | 'data': { 314 | 'vm_uuid': vm_uuid, 315 | 'config_uuid':config_uuid, 316 | 'deploy_uuid':deploy_uuid, 317 | 'file_path':file_path, 318 | 'min_unixtime':min_unixtime, 319 | 'max_unixtime':max_unixtime, 320 | 'total_records':total_records, 321 | 'storage_account':storage_account, 322 | 'container_name':container_name 323 | 324 | }, 325 | 'event_type': 'INGEST_ADX_FINSIHED', 326 | 'event_time': event_time, 327 | 'data_version': 1 328 | }] 329 | 330 | print ("{} Send to EventGrid with Content: {}".format(LOG_MESSAGE_HEADER,events_content)) 331 | 332 | event_grid_client.publish_events( 333 | EVENT_GRID_ENDPOINT, 334 | events=events_content 335 | ) 336 | 337 | 338 | def get_config_values(): 339 | global COSMOS_URL,COSMOS_KEY,COSMOS_DATABASE,COSMOS_CONTAINER, COSMOS_CLIENT 340 | global SOURCE_TELEMETRY_BLOB_ACCOUNT,SOURCE_TELEMETRY_FILE_BLOB_KEY 341 | global APP_AAD_TENANT_ID,APP_CLIENT_ID,APP_CLIENT_SECRETS,DATA_INGESTION_URI 342 | global APP_INSIGHT_ID,APP_INSIGHT_INGEST_SUCCESS_COUNT_NAME,APP_INSIGHT_INGEST_FAILURE_COUNT_NAME 343 | global APP_INSIGHT_INGEST_SUCCESS_EVENT_NAME,APP_INSIGHT_INGEST_FAILURE_EVENT_NAME 344 | global LOG_MESSAGE_HEADER,PROCESS_PROGRAM_NAME 345 | # COSMOS CONFIG 346 | COSMOS_URL= os.getenv("COSMOS_URL",COSMOS_URL) 347 | COSMOS_KEY= os.getenv("COSMOS_KEY",COSMOS_KEY) 348 | COSMOS_DATABASE=os.getenv("COSMOS_DATABASE",COSMOS_DATABASE) 349 | COSMOS_CONTAINER=os.getenv("COSMOS_CONTAINER",COSMOS_CONTAINER) 350 | 351 | 352 | SOURCE_TELEMETRY_BLOB_ACCOUNT=os.getenv("SOURCE_TELEMETRY_BLOB_ACCOUNT",SOURCE_TELEMETRY_BLOB_ACCOUNT) 353 | SOURCE_TELEMETRY_FILE_BLOB_KEY=os.getenv("SOURCE_TELEMETRY_FILE_BLOB_KEY",SOURCE_TELEMETRY_FILE_BLOB_KEY) 354 | 355 | # ADX CONFIG 356 | APP_AAD_TENANT_ID = os.getenv("APP_AAD_TENANT_ID",APP_AAD_TENANT_ID) 357 | APP_CLIENT_ID = os.getenv("APP_CLIENT_ID",APP_CLIENT_ID) 358 | APP_CLIENT_SECRETS=os.getenv("APP_CLIENT_SECRETS",APP_CLIENT_SECRETS) 359 | #KUSTO_ENGINE_URI =os.environ("KUSTO_ENGINE_URI",KUSTO_ENGINE_URI) 360 | DATA_INGESTION_URI =os.getenv("DATA_INGESTION_URI",DATA_INGESTION_URI) 361 | 362 | APP_INSIGHT_ID=os.getenv("APP_INSIGHT_ID",APP_INSIGHT_ID) 363 | APP_INSIGHT_INGEST_SUCCESS_COUNT_NAME=os.getenv("INGEST_WIN_TELEMETRY_JSON_SUCCESS_COUNT",APP_INSIGHT_INGEST_SUCCESS_COUNT_NAME) 364 | APP_INSIGHT_INGEST_FAILURE_COUNT_NAME=os.getenv("INGEST_WIN_TELEMETRY_JSON_FAILURE_COUNT",APP_INSIGHT_INGEST_FAILURE_COUNT_NAME) 365 | APP_INSIGHT_INGEST_SUCCESS_EVENT_NAME=os.getenv("INGEST_WIN_TELEMETRY_JSON_SUCCESS",APP_INSIGHT_INGEST_SUCCESS_EVENT_NAME) 366 | APP_INSIGHT_INGEST_FAILURE_EVENT_NAME=os.getenv("INGEST_WIN_TELEMETRY_JSON_FAILURE",APP_INSIGHT_INGEST_FAILURE_EVENT_NAME) 367 | 368 | LOG_MESSAGE_HEADER=os.getenv("LOG_MESSAGE_HEADER",LOG_MESSAGE_HEADER) 369 | PROCESS_PROGRAM_NAME=os.getenv("PROCESS_PROGRAM_NAME",PROCESS_PROGRAM_NAME) 370 | COSMOS_CLIENT = cosmos_client.CosmosClient(COSMOS_URL, {'masterKey': COSMOS_KEY}) 371 | 372 | def main(mytimer: func.TimerRequest) -> None: 373 | 374 | get_config_values() 375 | 376 | utc_timestamp = datetime.datetime.utcnow().replace( 377 | tzinfo=datetime.timezone.utc).isoformat() 378 | 379 | if mytimer.past_due: 380 | logging.info('The timer is past due!') 381 | 382 | logging.info('Python timer trigger function ran at %s', utc_timestamp) 383 | 384 | tc = TelemetryClient(APP_INSIGHT_ID) 385 | tc.context.application.ver = '1.0' 386 | tc.context.properties["PROCESS_PROGRAM"]=PROCESS_PROGRAM_NAME 387 | tc.context.properties["PROCESS_START"]=time.time() 388 | tc.track_trace('{} Start to Check ADX Ingest Log'.format(LOG_MESSAGE_HEADER)) 389 | tc.flush() 390 | 391 | update_ADX_ingest_status(tc) 392 | 393 | 394 | --------------------------------------------------------------------------------