├── model ├── __init__.py ├── caller_type.py ├── caller.py └── status.py ├── requirements.txt ├── cache ├── status.json └── status_2.json ├── cron.sh ├── setup.py ├── config.py.example ├── README.md ├── .gitignore ├── uploader.py ├── exchange.py └── downloader.py /model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | qiniu == 7.0.7 2 | -------------------------------------------------------------------------------- /cache/status.json: -------------------------------------------------------------------------------- 1 | {"new_count": 18692, "timestamp": 1500396167, "version": 212, "count": 188947, "md5": "dd8dd132e73f8d90aa8e6be9ffd6ad37"} -------------------------------------------------------------------------------- /cron.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo $(date) 4 | 5 | PWD="$(dirname $0)" 6 | 7 | echo "$PWD" 8 | 9 | cd "$PWD" || exit 1 10 | 11 | venv/bin/python exchange.py 12 | -------------------------------------------------------------------------------- /cache/status_2.json: -------------------------------------------------------------------------------- 1 | {"version": 221, "count": 354131, "new_count": 12180, "timestamp": 1597115539, "md5": "2d4d048f1cf629cae57517c53bb59e69", "url": "https://sh.xdty.org:10443/d/db/"} 2 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup( 4 | name='CallerExchange', 5 | version='', 6 | packages=['model'], 7 | url='', 8 | license='', 9 | author='ty', 10 | author_email='', 11 | description='' 12 | ) 13 | 
# ===== /config.py.example =====
# Template configuration -- copy to config.py and fill in real credentials.
# (Reproduced as comments so this reconstructed module stays importable.)
#
#   app_id = "APP_ID"
#   app_key = "MASTER_APP_KEY,master"
#   qn_access_key = "ACCESS_KEY"
#   qn_secret_key = "SECRET_KEY"
#   qn_bucket_name = "BUCKET_NAME"
#
#   cos_app_id = ""
#   cos_bucket_name = ""
#   cos_secret_id = ""
#   cos_key = ""

# ===== /README.md =====
# # CallerExchange
# CallerInfo offline data management service; three stages:
#   1. download all user-reported data from LeanCloud
#   2. filter the data and generate the offline database file
#   3. upload the file to Qiniu storage
#
# Run:
#   virtualenv -p python3 venv
#   source venv/bin/activate
#   pip install -r requirements.txt -I
#   python exchange.py
#
# crontab:
#   5 0 * * * /path/to/CallerExchange/cron.sh >> /var/log/exchange.log 2>&1

# ===== /model/caller_type.py =====
#!/usr/bin/env python3


class CallerType:
    """Integer tag categories stored in the offline caller database."""
    HARASSMENT = 0
    FRAUD = 1
    AD = 2
    EXPRESS = 3
    RESTAURANT = 4
    GENERAL = 64  # fallback for anything unrecognised


def from_name(name):
    """Map a free-form (Chinese) tag string to a CallerType constant.

    Matching is by substring; the first hit wins, and unknown tags fall
    back to CallerType.GENERAL.
    """
    if '骚扰' in name:
        return CallerType.HARASSMENT
    if '诈骗' in name or '欺诈' in name:
        return CallerType.FRAUD
    if '广告' in name or '推销' in name:
        return CallerType.AD
    if '快递' in name or 'EMS' in name or '顺丰' in name:
        return CallerType.EXPRESS
    if '送餐' in name or '外卖' in name:
        return CallerType.RESTAURANT
    return CallerType.GENERAL
# ===== /model/caller.py =====
#!/usr/bin/env python3
from datetime import datetime
from time import mktime


class Caller:
    """Model of one reported caller record from the LeanCloud dump."""

    # Class-level defaults; __init__ rebinds the instance dict to the raw
    # JSON record, so these act as fallbacks for keys missing from it.
    number = None
    name = None
    count = 0
    type = None
    source = None
    uid = None
    time = None
    repeat = 0

    def __init__(self, s):
        """Build a Caller from a raw record dict *s*.

        Expects at least the keys 'from', 'createdAt', 'number', 'name'.
        NOTE: *s* itself becomes the instance __dict__, so the record is
        mutated in place by the normalisation below.
        """
        self.__dict__ = s
        # 'from' is a Python keyword, so expose it as .source instead.
        self.source = self.__dict__['from']
        # createdAt is an ISO-8601 string ending in 'Z'.
        # NOTE(review): mktime() interprets the tuple as *local* time, so
        # the epoch value is skewed by the UTC offset -- kept for backward
        # compatibility with already-published databases; confirm.
        self.time = int(mktime(datetime.strptime(
            self.__dict__['createdAt'], "%Y-%m-%dT%H:%M:%S.%fZ").timetuple()))
        # Normalise the phone number: drop the +86 country prefix and any
        # spaces.  str.replace is a no-op when absent, so no 'in' guards.
        self.number = self.number.replace('+86', '').replace(' ', '')
        # Some upload clients append a stray ':0' suffix to the tag name.
        self.name = self.name.replace(':0', '')

    def dump(self):
        """Debug print of the significant fields."""
        print(self.number, self.name, self.count, self.type, self.source, self.time)

    def dict(self):
        """Tuple in the column order of the sqlite 'caller' table."""
        return self.number, self.name, self.count, self.type, self.source, self.time


# ===== /.gitignore (first half, preserved as comments) =====
# Created by https://www.gitignore.io/api/pycharm
# ### PyCharm ###
# .idea/
# .idea/workspace.xml
# .idea/tasks.xml
# .idea/dictionaries
# .idea/vcs.xml
# .idea/jsLibraryMappings.xml
# .idea/dataSources.ids
# .idea/dataSources.xml
# .idea/dataSources.local.xml
# .idea/sqlDataSources.xml
# .idea/dynamic.xml
# .idea/uiDesigner.xml
# .idea/gradle.xml
# .idea/libraries
# (Mongo Explorer plugin:)
# ===== /.gitignore (second half, preserved as comments) =====
# .idea/mongoSettings.xml
# *.iws
# /out/
# .idea_modules/
# atlassian-ide-plugin.xml
# com_crashlytics_export_strings.xml
# crashlytics.properties
# crashlytics-build.properties
# fabric.properties
# __pycache__
# *.db
# *.json
# config.py
# cache/
# venv/
# data/

# ===== /model/status.py =====
#!/usr/bin/env python3
import hashlib
import json
import os
import time

from functools import partial

# Where the published status metadata is cached between runs.
data_file = "cache/status_2.json"


class Status:
    """Persistent metadata about the generated offline database."""

    # Defaults used when no cached status file exists yet.
    version = 0
    count = 0
    new_count = 0
    timestamp = None
    md5 = None
    url = 'https://sh.xdty.org:10443/d/db/'

    def __init__(self):
        """Load the previously persisted status, if any."""
        if not os.path.exists(data_file):
            return
        with open(data_file) as f:
            # update() (rather than replacing __dict__) keeps the class
            # defaults for keys absent from older cache files (e.g. "url").
            self.__dict__.update(json.load(f))

    def dump(self):
        """Debug print of version and timestamp."""
        print(self.version, self.timestamp)

    def json(self):
        """Serializable dict in the shape written to cache/status_2.json."""
        return {"version": self.version, "count": self.count, "new_count": self.new_count,
                "timestamp": self.timestamp, 'md5': self.md5, "url": self.url}

    def update(self, file):
        """Record checksum and time for *file*, then persist to disk.

        BUG FIX: the old code used time.mktime(datetime.now().utctimetuple()),
        which feeds a UTC tuple to mktime -- mktime treats it as *local*
        time, skewing the epoch by the UTC offset.  time.time() is correct.
        """
        self.timestamp = int(time.time())
        self.md5 = md5sum(file)
        with open(data_file, "w") as f:
            json.dump(self.json(), f)

    def to_list(self):
        """Values in the column order of the sqlite 'status' table."""
        return [self.version, self.count, self.new_count, self.timestamp]

    def bump(self):
        """Advance to the next database version."""
        self.version += 1


def md5sum(filename):
    """MD5 hex digest of a file, read in chunks to bound memory use."""
    d = hashlib.md5()
    with open(filename, mode='rb') as f:
        # 64 KiB chunks (was 128 B) -- identical result, far fewer reads.
        for buf in iter(partial(f.read, 65536), b''):
            d.update(buf)
    return d.hexdigest()


# ===== /uploader.py =====
#!/usr/bin/env python3
import os

import qiniu

import config

access_key = config.qn_access_key
secret_key = config.qn_secret_key
bucket_name = config.qn_bucket_name

cos_app_id = config.cos_app_id
cos_bucket_name = config.cos_bucket_name
cos_secret_id = config.cos_secret_id
cos_key = config.cos_key


def upload(name):
    """Upload entry point.  Currently a deliberate no-op; re-enable one
    of the backends below to actually publish."""
    # upload_file(name)
    # upload_cos(name)
    pass


def upload_file(file_name):
    """Upload *file_name* to the configured Qiniu bucket."""
    q = qiniu.Auth(access_key, secret_key)
    key = os.path.basename(file_name)
    token = q.upload_token(bucket_name, key)
    ret, info = qiniu.put_file(token, key, file_name)
    if ret is not None:
        print(file_name + ' uploaded.')
    else:
        print(info)


def upload_cos(file):
    """Upload *file* to QCloud COS via its HTTP API."""
    headers = {
        'Authorization': sign()
    }
    url = 'https://web.file.myqcloud.com/files/v1/' + cos_app_id + '/' + cos_bucket_name + '/' + os.path.basename(file)
    data = {'op': 'upload', 'insertOnly': '0'}
    files = {'filecontent': open(file, 'rb')}
    import requests
    r = requests.post(url, data=data, files=files, headers=headers)
    print(r.text)


def sign():
    """Build the QCloud COS multi-use signature: base64(hmac_sha1 + plain)."""
    import hmac
    import hashlib
    import time
    import base64

    # a=[appid]&b=[bucket]&k=[SecretID]&e=[expiredTime]&t=[currentTime]&r=[rand]&f=
    current_time = int(time.time())
    sign_text = ('a=' + cos_app_id + '&b=' + cos_bucket_name + '&k=' + cos_secret_id +
                 '&e=' + str(current_time + 3600) + '&t=' + str(current_time) + '&r=123&f=')
    sign_tmp = hmac.new(cos_key.encode(), sign_text.encode(), hashlib.sha1).digest() + sign_text.encode()
    return base64.b64encode(sign_tmp).decode()


# ===== /exchange.py =====
#!/usr/bin/env python3
import json
import operator
import os
import re
import sqlite3
import zipfile

import downloader
import uploader
from model import caller_type
from model.caller import Caller
from model.status import Status, data_file


def compress(file_name):
    """Deflate-zip *file_name* next to itself; return the zip path."""
    zip_file = file_name + ".zip"
    with zipfile.ZipFile(zip_file, "w", zipfile.ZIP_DEFLATED) as zf:
        zf.write(file_name, arcname=os.path.basename(file_name))
    return zip_file


status = Status()

# 1. download offline file from LeanCloud

# result_json = downloader.run()
# NOTE(review): the live download is disabled; a pre-downloaded dump is
# used instead -- confirm before deploying.
result_json = 'cache/caller.json'

if result_json == 'error':
    print("Download error!")
    exit(-1)

# 2.
# 2. read file to caller model map

caller_map = {}  # number -> [Caller, ...]

# Numbers whose owners appealed; these are excluded from the offline db.
# (set, not list: O(1) membership in the filter loop below)
with open("cache/appeal.json") as f:
    appeal = {item["number"] for item in json.load(f)["results"]}

with open(result_json) as f:
    data = json.load(f)

for item in data["results"]:
    caller = Caller(item)

    # filter appealed numbers
    if caller.number in appeal:
        print("number in appeal: " + caller.number)
        continue

    # filter malformed numbers (digits and '+' only).
    # FIX: raw string -- "\d" in a plain literal is an invalid escape
    # on modern Python.
    if not re.match(r"^[\d+]*$", caller.number):
        continue

    # filter obviously wrong tags / counts
    if caller.type < 0 or caller.type > 16 or caller.count == 10000 or caller.count < 0:
        continue

    records = caller_map.setdefault(caller.number, [])

    # merge duplicates (same name/type/source) instead of appending again
    duplicate = None
    for existing in records:
        if (existing.name == caller.name and existing.type == caller.type
                and existing.source == caller.source):
            duplicate = existing
            break
    if duplicate is None:
        records.append(caller)
    else:
        # NOTE(review): nesting reconstructed from a flattened dump --
        # repeat is assumed to increment on every duplicate; confirm.
        if caller.count == 0:
            duplicate.count += 1
        elif caller.count > duplicate.count:
            duplicate.count = caller.count
        duplicate.repeat += 1
# 3. resort caller list from map

caller_list = []

for candidates in caller_map.values():
    count = 0
    repeat = 0
    target = candidates[0]
    # presumably 8 marks "user reported" data; 0..2 are crawled feeds
    # (baidu / 360 / sogou) -- TODO confirm
    source = 8

    # pick the entry confirmed most often; remember the best crawled source
    for candidate in candidates:
        if candidate.repeat > repeat:
            target = candidate
            count = candidate.count
            repeat = candidate.repeat
        if 0 <= candidate.source <= 2:
            source = candidate.source
            # crawled feeds carry a textual tag; derive the numeric type
            # from it.  NOTE(review): nesting reconstructed from a
            # flattened dump -- confirm these two lines belong inside the
            # source check.
            name = candidate.name
            target.type = caller_type.from_name(name)

    # purely user-marked numbers with several reports: majority vote on type
    if count == 0 and source == 8 and len(candidates) > 2:
        votes = dict()
        for candidate in candidates:
            votes[candidate.type] = votes.get(candidate.type, 0) + 1
        top = max(votes.items(), key=operator.itemgetter(1))[0]
        for candidate in candidates:
            if candidate.type == top:
                candidate.count = votes[top]
                target = candidate
                break

    caller_list.append(target.dict())
# 4. write to database file

status.new_count = len(caller_list) - status.count

if status.new_count == 0:
    # NOTE(review): a *negative* delta (shrunk dataset) still publishes a
    # new version; confirm that is intended.
    print("No new data.")
    exit(0)

status.count = len(caller_list)
status.bump()

# One sqlite file per published version.
conn = sqlite3.connect('cache/caller_' + str(status.version) + '.db')
cur = conn.cursor()
cur.execute('''CREATE TABLE IF NOT EXISTS caller
( id INTEGER PRIMARY KEY AUTOINCREMENT, number TEXT UNIQUE, name TEXT, count INTEGER, type INTEGER, source INTEGER,
time INTEGER );''')
# caller_list rows already match the column order (see Caller.dict()).
cur.executemany('insert into caller (number, name, count, type, source, time) values (?, ?, ?, ?, ?, ?)',
                caller_list)

cur.execute('''CREATE TABLE IF NOT EXISTS status
( id INTEGER PRIMARY KEY AUTOINCREMENT, version INTEGER, count INTEGER, new_count INTEGER, time INTEGER );''')
cur.execute('insert into status (version, count, new_count, time) values (?, ?, ?, ?)', status.to_list())

conn.commit()
cur.close()
conn.close()
# 5. upload offline database to QiNiu

zip_file = compress('cache/caller_' + str(status.version) + '.db')
status.update(zip_file)

# upload files
# uploader.upload(zip_file)  # TODO(review): artifact upload is disabled
uploader.upload(data_file)

# ===== /downloader.py =====
#!/usr/bin/env python3
"""Download reported caller data from LeanCloud bigquery export jobs."""
import gzip
import json
import os
import time
import urllib.request
from urllib.error import HTTPError

import config

job_url = 'https://leancloud.cn/1.1/bigquery/job'

# One export job per createdAt window, keeping individual exports small.
job_list = [
    {"appId": config.app_id, "jobConfig": {"sql": 'select * from caller WHERE createdAt <= '
                                                  '"2016-08-01 00:00:00.000" ORDER BY createdAt'}},
    {"appId": config.app_id, "jobConfig": {"sql": 'select * from caller WHERE createdAt >= '
                                                  '"2016-08-01 00:00:00.000" AND createdAt <='
                                                  '"2017-01-01 00:00:00.000" ORDER BY createdAt'}},
    {"appId": config.app_id, "jobConfig": {"sql": 'select * from caller WHERE createdAt >= '
                                                  '"2017-01-01 00:00:00.000" AND createdAt <='
                                                  '"2017-04-01 00:00:00.000" ORDER BY createdAt'}}
]

cache_dir = 'cache/'

headers = {
    "X-LC-Id": config.app_id,
    "X-LC-Key": config.app_key,
    "Content-Type": "application/json"
}


def run():
    """Run every export job and merge the JSON caches into one file.

    Returns the merged cache path, or the string 'error' on failure.
    """
    caches = []
    for job_params in job_list:
        cache_file = run_once(job_params)
        print(cache_file)
        if cache_file == 'error':
            return 'error'
        caches.append(cache_file)

    # combine cache files to single one, named by a time-seeded sha1
    import hashlib
    sha1 = hashlib.sha1()
    sha1.update(str(time.time()).encode('utf-8'))
    res = cache_dir + sha1.hexdigest()

    with open(res, 'w') as cache:
        for name in caches:
            with open(name) as f_cache:
                for line in f_cache:
                    cache.write(line)
    return res
def run_once(job_params):
    """Execute one export job end-to-end.

    Returns the extracted json path, or 'error' when the export step
    fails.  Extraction errors (OSError) are retried indefinitely.
    """
    job = run_job(job_params)
    check_status(job.id)
    path = export(job.id)
    if path is None:
        # FIX: export() returns None on failure; previously this fell
        # through to download(None) and crashed.
        return 'error'
    while True:
        try:
            dir_name, file_name = download(path)
            return extract(dir_name, file_name)
        except OSError:
            print('error extract file, try again.')
            time.sleep(10)


def run_job(job_params):
    """POST a bigquery job to LeanCloud; return the created Job."""
    data = json.dumps(job_params).encode('utf8')
    req = urllib.request.Request(job_url, data=data, headers=headers)
    res = urllib.request.urlopen(req)
    return Job(res.read().decode('utf8'))


def check_status(job_id):
    """Poll the job until it reports OK."""
    url = job_url + '/' + job_id + '?anchor=0&limit=1'

    while True:
        req = urllib.request.Request(url, headers=headers)
        res = urllib.request.urlopen(req)
        job_status = JobStatus(res.read().decode('utf8'))

        if job_status.status == 'OK':
            break
        if job_status.status == 'RUNNING':
            print('RUNNING')
        # FIX: previously any state other than RUNNING/OK spun in a busy
        # loop with no sleep; now every non-OK state waits before retrying.
        time.sleep(3)


def export(job_id):
    """Request an export of the finished job.

    Returns the download path, or None when LeanCloud reports failure.
    """
    url = 'https://leancloud.cn/1.1/bigquery/job/' + job_id + '/export'
    req = urllib.request.Request(url, data=''.encode('utf8'), headers=headers)
    res = urllib.request.urlopen(req)
    job_export = JobExport(res.read().decode('utf8'))

    if job_export.status == 'OK':
        return job_export.path
    print('Error, export failed.')
    return None


def download(url):
    """Fetch *url* into cache_dir; returns (dir_name, file_name).

    Retries forever on HTTPError because the export file appears on the
    server asynchronously (404 until ready).
    """
    while True:
        try:
            res = urllib.request.urlopen(url)
            break
        except HTTPError:
            print('download error, try again.')
            time.sleep(3)

    dir_name = url.split('/')[-2] + '/'
    file_name = url.split('/')[-1]

    # FIX: exist_ok avoids the check-then-create race; 'wb' replaces the
    # unconventional 'b+w' mode (same effect, canonical spelling).
    os.makedirs(cache_dir + dir_name, exist_ok=True)
    with open(cache_dir + dir_name + file_name, 'wb') as f:
        f.write(res.read())
    return dir_name, file_name
def download(url):
    """Fetch *url* into cache_dir; returns (dir_name, file_name).

    Retries forever on HTTPError because the export file appears on the
    server asynchronously (404 until ready).
    """
    while True:
        try:
            res = urllib.request.urlopen(url)
            break
        except HTTPError:
            print('download error, try again.')
            time.sleep(3)

    dir_name = url.split('/')[-2] + '/'
    file_name = url.split('/')[-1]

    # FIX: exist_ok avoids the check-then-create race; 'wb' replaces the
    # unconventional 'b+w' mode (same effect, canonical spelling).
    os.makedirs(cache_dir + dir_name, exist_ok=True)
    with open(cache_dir + dir_name + file_name, 'wb') as f:
        f.write(res.read())
    return dir_name, file_name


def extract(dir_name, file_name):
    """Decompress a downloaded .gz export into a .json file next to it.

    Returns the json path, or the string 'error' if it was not created.
    FIX: the old code never closed the output file and called gz.close()
    twice; both handles are now managed by `with`.
    """
    json_file = cache_dir + dir_name + get_filename(file_name)
    with gzip.open(cache_dir + dir_name + file_name, "rb") as gz, \
            open(json_file, 'wb') as jf:
        jf.write(gz.read())

    if not os.path.exists(json_file):
        return 'error'
    return json_file


def get_filename(path):
    """Base name of *path* with every extension stripped ('a/b.json.gz' -> 'b')."""
    return path.split('/')[-1].split('.')[0]


class Job:
    """Response wrapper for a created bigquery job.

    Attributes come straight from the JSON payload; the class attributes
    below only document the expected fields.
    """
    id = None
    appId = None

    def __init__(self, s):
        self.__dict__ = json.loads(s)

    def dump(self):
        print(self.id, self.appId)


class JobStatus:
    """Response wrapper for a job-status poll (status: RUNNING / OK / ...)."""
    id = None
    status = None

    def __init__(self, s):
        self.__dict__ = json.loads(s)

    def dump(self):
        print(self.id, self.status)


class JobExport:
    """Response wrapper for an export request (path: download URL)."""
    status = None
    path = None

    def __init__(self, s):
        self.__dict__ = json.loads(s)

    def dump(self):
        print(self.status, self.path)