├── model
    ├── __init__.py
    ├── caller_type.py
    ├── caller.py
    └── status.py
├── requirements.txt
├── cache
    ├── status.json
    └── status_2.json
├── cron.sh
├── setup.py
├── config.py.example
├── README.md
├── .gitignore
├── uploader.py
├── exchange.py
└── downloader.py


/model/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | qiniu == 7.0.7
2 | 


--------------------------------------------------------------------------------
/cache/status.json:
--------------------------------------------------------------------------------
1 | {"new_count": 18692, "timestamp": 1500396167, "version": 212, "count": 188947, "md5": "dd8dd132e73f8d90aa8e6be9ffd6ad37"}


--------------------------------------------------------------------------------
/cron.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | echo $(date)
 4 | 
 5 | PWD="$(dirname $0)"
 6 | 
 7 | echo "$PWD"
 8 | 
 9 | cd "$PWD" || exit 1
10 | 
11 | venv/bin/python exchange.py
12 | 


--------------------------------------------------------------------------------
/cache/status_2.json:
--------------------------------------------------------------------------------
1 | {"version": 221, "count": 354131, "new_count": 12180, "timestamp": 1597115539, "md5": "2d4d048f1cf629cae57517c53bb59e69", "url": "https://sh.xdty.org:10443/d/db/"}
2 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from distutils.core import setup
 2 | 
 3 | setup(
 4 |     name='CallerExchange',
 5 |     version='',
 6 |     packages=['model'],
 7 |     url='',
 8 |     license='',
 9 |     author='ty',
10 |     author_email='',
11 |     description=''
12 | )
13 | 


--------------------------------------------------------------------------------
/config.py.example:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | app_id = "APP_ID"
 4 | app_key = "MASTER_APP_KEY,master"
 5 | qn_access_key = "ACCESS_KEY"
 6 | qn_secret_key = "SECRET_KEY"
 7 | qn_bucket_name = "BUCKET_NAME"
 8 | 
 9 | cos_app_id = ""
10 | cos_bucket_name = ""
11 | cos_secret_id = ""
12 | cos_key = ""
13 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # CallerExchange
 2 | CallerInfo 离线数据管理服务, 服务主要有三个过程
 3 | 
 4 | 1\. 从 leancloud 下载所有用户上报的数据
 5 | 
 6 | 2\. 对数据进行过滤处理，生成离线数据库文件
 7 | 
 8 | 3\. 上传文件到七牛存储
 9 | 
10 | ## 运行
11 | 
12 | ```
13 | virtualenv -p python3 venv
14 | source venv/bin/activate
15 | pip install -r requirements.txt -I
16 | python exchange.py
17 | ```
18 | 
19 | **crontab**
20 | 
21 | ```
22 | 5 0 * * * /path/to/CallerExchange/cron.sh >> /var/log/exchange.log 2>&1
23 | ```
24 | 


--------------------------------------------------------------------------------
/model/caller_type.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | 
 4 | class CallerType:
 5 |     HARASSMENT = 0
 6 |     FRAUD = 1
 7 |     AD = 2
 8 |     EXPRESS = 3
 9 |     RESTAURANT = 4
10 |     GENERAL = 64
11 | 
12 | 
13 | def from_name(name):
14 |     if '骚扰' in name:
15 |         return CallerType.HARASSMENT
16 |     if '诈骗' in name or '欺诈' in name:
17 |         return CallerType.FRAUD
18 |     if '广告' in name or '推销' in name:
19 |         return CallerType.AD
20 |     if '快递' in name or 'EMS' in name or '顺丰' in name:
21 |         return CallerType.EXPRESS
22 |     if '送餐' in name or '外卖' in name:
23 |         return CallerType.RESTAURANT
24 |     return CallerType.GENERAL
25 | 


--------------------------------------------------------------------------------
/model/caller.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | import json
 3 | 
 4 | from datetime import datetime
 5 | from time import mktime
 6 | 
 7 | 
 8 | class Caller:
 9 |     """model of caller class for LeanCloud data"""
10 |     number = None
11 |     name = None
12 |     count = 0
13 |     type = None
14 |     source = None
15 |     uid = None
16 |     time = None
17 |     repeat = 0
18 | 
19 |     def __init__(self, s):
20 |         self.__dict__ = s
21 |         self.source = self.__dict__['from']
22 |         self.time = int(mktime(datetime.strptime(self.__dict__['createdAt'], "%Y-%m-%dT%H:%M:%S.%fZ").timetuple()))
23 |         if '+86' in self.number:
24 |             self.number = self.number.replace('+86', '')
25 |         if ' ' in self.number:
26 |             self.number = self.number.replace(' ', '')
27 |         if '：0' in self.name:
28 |             self.name = self.name.replace('：0', '')
29 | 
30 |     def dump(self):
31 |         print(self.number, self.name, self.count, self.type, self.source, self.time)
32 | 
33 |     def dict(self):
34 |         return self.number, self.name, self.count, self.type, self.source, self.time
35 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | 
 2 | # Created by https://www.gitignore.io/api/pycharm
 3 | 
 4 | ### PyCharm ###
 5 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
 6 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
 7 | 
 8 | .idea/
 9 | # User-specific stuff:
10 | .idea/workspace.xml
11 | .idea/tasks.xml
12 | .idea/dictionaries
13 | .idea/vcs.xml
14 | .idea/jsLibraryMappings.xml
15 | 
16 | # Sensitive or high-churn files:
17 | .idea/dataSources.ids
18 | .idea/dataSources.xml
19 | .idea/dataSources.local.xml
20 | .idea/sqlDataSources.xml
21 | .idea/dynamic.xml
22 | .idea/uiDesigner.xml
23 | 
24 | # Gradle:
25 | .idea/gradle.xml
26 | .idea/libraries
27 | 
28 | # Mongo Explorer plugin:
29 | .idea/mongoSettings.xml
30 | 
31 | ## File-based project format:
32 | *.iws
33 | 
34 | ## Plugin-specific files:
35 | 
36 | # IntelliJ
37 | /out/
38 | 
39 | # mpeltonen/sbt-idea plugin
40 | .idea_modules/
41 | 
42 | # JIRA plugin
43 | atlassian-ide-plugin.xml
44 | 
45 | # Crashlytics plugin (for Android Studio and IntelliJ)
46 | com_crashlytics_export_strings.xml
47 | crashlytics.properties
48 | crashlytics-build.properties
49 | fabric.properties
50 | 
51 | ### PyCharm Patch ###
52 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
53 | 
54 | # *.iml
55 | # modules.xml
56 | __pycache__
57 | 
58 | *.db
59 | *.json
60 | config.py
61 | cache/
62 | venv/
63 | data/
64 | 


--------------------------------------------------------------------------------
/model/status.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | import hashlib
 3 | import json
 4 | import os
 5 | import time
 6 | 
 7 | from datetime import datetime
 8 | from functools import partial
 9 | 
10 | data_file = "cache/status_2.json"
11 | 
12 | 
13 | class Status:
14 |     version = 0
15 |     count = 0
16 |     new_count = 0
17 |     timestamp = None
18 |     md5 = None
19 |     url = 'https://sh.xdty.org:10443/d/db/'
20 | 
21 |     def __init__(self):
22 |         if not os.path.exists(data_file):
23 |             return
24 |         with open(data_file) as f:
25 |             self.__dict__ = json.loads(f.read())
26 | 
27 |     def dump(self):
28 |         print(self.version, self.timestamp)
29 | 
30 |     def json(self):
31 |         return {"version": self.version, "count": self.count, "new_count": self.new_count, "timestamp": self.timestamp,
32 |                 'md5': self.md5, "url": self.url}
33 | 
34 |     def update(self, file):
35 |         self.timestamp = int(time.mktime(datetime.now().utctimetuple()))
36 |         self.md5 = md5sum(file)
37 |         with open(data_file, "w") as f:
38 |             f.write(json.dumps(self.json()))
39 | 
40 |     def to_list(self):
41 |         return [self.version, self.count, self.new_count, self.timestamp]
42 | 
43 |     def bump(self):
44 |         self.version += 1
45 | 
46 | 
47 | def md5sum(filename):
48 |     with open(filename, mode='rb') as f:
49 |         d = hashlib.md5()
50 |         for buf in iter(partial(f.read, 128), b''):
51 |             d.update(buf)
52 |     return d.hexdigest()
53 | 


--------------------------------------------------------------------------------
/uploader.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | import os
 3 | 
 4 | import qiniu
 5 | 
 6 | import config
 7 | 
 8 | access_key = config.qn_access_key
 9 | secret_key = config.qn_secret_key
10 | bucket_name = config.qn_bucket_name
11 | 
12 | cos_app_id = config.cos_app_id
13 | cos_bucket_name = config.cos_bucket_name
14 | cos_secret_id = config.cos_secret_id
15 | cos_key = config.cos_key
16 | 
17 | 
18 | def upload(name):
19 |     # upload_file(name)
20 |     # upload_cos(name)
21 |     pass
22 | 
23 | 
24 | # upload to qiniu
25 | def upload_file(file_name):
26 |     q = qiniu.Auth(access_key, secret_key)
27 | 
28 |     key = os.path.basename(file_name)
29 | 
30 |     token = q.upload_token(bucket_name, key)
31 |     ret, info = qiniu.put_file(token, key, file_name)
32 |     if ret is not None:
33 |         print(file_name + ' uploaded.')
34 |     else:
35 |         print(info)
36 | 
37 | 
38 | # upload to q-cloud cos
39 | def upload_cos(file):
40 |     headers = {
41 |         'Authorization': sign()
42 |     }
43 |     url = 'https://web.file.myqcloud.com/files/v1/' + cos_app_id + '/' + cos_bucket_name + '/' + os.path.basename(file)
44 |     data = {'op': 'upload', 'insertOnly': '0'}
45 |     files = {'filecontent': open(file, 'rb')}
46 |     import requests
47 |     r = requests.post(url, data=data, files=files, headers=headers)
48 |     print(r.text)
49 | 
50 | 
51 | def sign():
52 |     import hmac
53 |     import hashlib
54 | 
55 |     # a=[appid]&b=[bucket]&k=[SecretID]&e=[expiredTime]&t=[currentTime]&r=[rand]&f=
56 |     import time
57 |     current_time = int(time.time())
58 |     sign_text = 'a=' + cos_app_id + '&b=' + cos_bucket_name + '&k=' + cos_secret_id + '&e=' + str(
59 |         current_time + 3600) + '&t=' + str(current_time) + '&r=123&f='
60 |     sign_tmp = hmac.new(cos_key.encode(), sign_text.encode(), hashlib.sha1).digest() + sign_text.encode()
61 |     import base64
62 | 
63 |     return base64.b64encode(sign_tmp).decode()
64 | 
65 | 


--------------------------------------------------------------------------------
/exchange.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | import json
  3 | import operator
  4 | import os
  5 | 
  6 | import re
  7 | import zipfile
  8 | 
  9 | import downloader
 10 | import uploader
 11 | from model import caller_type
 12 | from model.caller import Caller
 13 | import sqlite3
 14 | 
 15 | from model.status import Status, data_file
 16 | 
 17 | 
 18 | def compress(file_name):
 19 |     zip_file = file_name + ".zip"
 20 |     zf = zipfile.ZipFile(zip_file, "w", zipfile.ZIP_DEFLATED)
 21 |     zf.write(file_name, arcname=os.path.basename(file_name))
 22 |     zf.close()
 23 |     return zip_file
 24 | 
 25 | 
 26 | status = Status()
 27 | 
 28 | # 1. download offline file from LeanCloud
 29 | 
 30 | # result_json = downloader.run()
 31 | 
 32 | result_json = 'cache/caller.json'
 33 | 
 34 | if result_json == 'error':
 35 |     print("Download error!")
 36 |     exit(-1)
 37 | 
 38 | # 2. read file to caller model map
 39 | 
 40 | caller_map = {}  # number:[caller]
 41 | 
 42 | # read appeal
 43 | appeal = []
 44 | with open("cache/appeal.json") as f:
 45 |     data = json.load(f)["results"]
 46 |     for item in data:
 47 |         appeal.append(item["number"])
 48 | 
 49 | with open(result_json) as f:
 50 |     data = json.load(f)
 51 |     for item in data["results"]:
 52 |         caller = Caller(item)
 53 | 
 54 |         # filter appeal
 55 |         if caller.number in appeal:
 56 |             print("number in appeal: " + caller.number)
 57 |             continue
 58 | 
 59 |         # filter wrong number
 60 |         if not re.match("^[\d\+]*$", caller.number):
 61 |             continue
 62 | 
 63 |         # filter wrong tagged number
 64 |         if caller.type < 0 or caller.type > 16 or caller.count == 10000 or caller.count < 0:
 65 |             continue
 66 | 
 67 |         # add to caller map
 68 |         if caller.number not in caller_map.keys():
 69 |             caller_map[caller.number] = []
 70 | 
 71 |         # filter repeated number
 72 |         append = True
 73 |         for i in range(0, len(caller_map[caller.number])):
 74 |             c = caller_map[caller.number][i]
 75 | 
 76 |             if c.name == caller.name and c.type == caller.type and c.source == caller.source:
 77 | 
 78 |                 if caller.count == 0:
 79 |                     caller_map[caller.number][i].count += 1
 80 |                 elif caller.count > caller_map[caller.number][i].count:
 81 |                     caller_map[caller.number][i].count = caller.count
 82 |                 caller_map[caller.number][i].repeat += 1
 83 |                 append = False
 84 |                 break
 85 |         if append:
 86 |             caller_map[caller.number].append(caller)
 87 | 
 88 | # 3. resort caller list from map
 89 | 
 90 | caller_list = []
 91 | 
 92 | for number in caller_map:
 93 |     c_list = caller_map[number]
 94 |     count = 0
 95 |     repeat = 0
 96 |     target = c_list[0]
 97 |     source = 8
 98 | 
 99 |     # find max count from baidu, 360 or sogou
100 |     for caller in c_list:
101 |         if caller.repeat > repeat:
102 |             target = caller
103 |             count = caller.count
104 |             repeat = caller.repeat
105 |         if 0 <= caller.source <= 2:
106 |             source = caller.source
107 |             # set caller type
108 |             name = caller.name
109 |             target.type = caller_type.from_name(name)
110 | 
111 |     # find max type count from user marked
112 |     if count == 0 and source == 8 and len(c_list) > 2:
113 |         counts = dict()
114 |         for caller in c_list:
115 |             t = caller.type
116 |             counts[t] = counts.get(t, 0) + 1
117 |         t = max(counts.items(), key=operator.itemgetter(1))[0]
118 |         for caller in c_list:
119 |             if caller.type == t:
120 |                 caller.count = counts[t]
121 |                 target = caller
122 |                 break
123 | 
124 |     caller_list.append(target.dict())
125 | 
126 | # 4. write to database file
127 | status.new_count = len(caller_list) - status.count
128 | 
129 | if status.new_count == 0:
130 |     print("No new data.")
131 |     exit(0)
132 | 
133 | status.count = len(caller_list)
134 | status.bump()
135 | 
136 | conn = sqlite3.connect('cache/caller_' + str(status.version) + '.db')
137 | cur = conn.cursor()
138 | cur.execute('''CREATE TABLE IF NOT EXISTS caller
139 |     ( id INTEGER PRIMARY KEY AUTOINCREMENT, number TEXT UNIQUE, name TEXT, count INTEGER, type INTEGER, source INTEGER,
140 |     time INTEGER );''')
141 | # for caller in caller_list:
142 | #     print(caller)
143 | #     pass
144 | cur.executemany('insert into caller (number, name, count, type, source, time) values (?, ?, ?, ?, ?, ?)', caller_list)
145 | 
146 | cur.execute('''CREATE TABLE IF NOT EXISTS status
147 |     ( id INTEGER PRIMARY KEY AUTOINCREMENT, version INTEGER, count INTEGER, new_count INTEGER, time INTEGER );''')
148 | 
149 | cur.execute('insert into status (version, count, new_count, time) values (?, ?, ?, ?)', status.to_list())
150 | 
151 | conn.commit()
152 | cur.close()
153 | conn.close()
154 | 
155 | # 5. upload offline database to QiNiu
156 | 
157 | zip_file = compress('cache/caller_' + str(status.version) + '.db')
158 | status.update(zip_file)
159 | 
160 | # upload files
161 | # uploader.upload(zip_file)
162 | uploader.upload(data_file)
163 | 


--------------------------------------------------------------------------------
/downloader.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | import json
  4 | import os
  5 | import gzip
  6 | import urllib.request
  7 | 
  8 | import time
  9 | from urllib.error import HTTPError
 10 | 
 11 | import config
 12 | 
 13 | job_url = 'https://leancloud.cn/1.1/bigquery/job'
 14 | 
 15 | job_list = [
 16 |     {"appId": config.app_id, "jobConfig": {"sql": 'select * from caller WHERE createdAt <= '
 17 |                                                   '"2016-08-01 00:00:00.000" ORDER BY createdAt'}},
 18 |     {"appId": config.app_id, "jobConfig": {"sql": 'select * from caller WHERE createdAt >= '
 19 |                                                   '"2016-08-01 00:00:00.000" AND createdAt <='
 20 |                                                   '"2017-01-01 00:00:00.000" ORDER BY createdAt'}},
 21 |     {"appId": config.app_id, "jobConfig": {"sql": 'select * from caller WHERE createdAt >= '
 22 |                                                   '"2017-01-01 00:00:00.000" AND createdAt <='
 23 |                                                   '"2017-04-01 00:00:00.000" ORDER BY createdAt'}}
 24 | ]
 25 | 
 26 | cache_dir = 'cache/'
 27 | 
 28 | headers = {
 29 |     "X-LC-Id": config.app_id,
 30 |     "X-LC-Key": config.app_key,
 31 |     "Content-Type": "application/json"
 32 | }
 33 | 
 34 | 
 35 | def run():
 36 |     caches = []
 37 |     for job_params in job_list:
 38 |         cache_file = run_once(job_params)
 39 |         print(cache_file)
 40 |         if cache_file == 'error':
 41 |             return 'error'
 42 |         caches.append(cache_file)
 43 | 
 44 |     # combine cache files to single one.
 45 |     import hashlib
 46 |     sha1 = hashlib.sha1()
 47 |     sha1.update(str(time.time()).encode('utf-8'))
 48 |     res = cache_dir + sha1.hexdigest()
 49 | 
 50 |     with open(res, 'w') as cache:
 51 |         for f in caches:
 52 |             with open(f) as f_cache:
 53 |                 for line in f_cache:
 54 |                     cache.write(line)
 55 |     return res
 56 | 
 57 | 
 58 | def run_once(job_params):
 59 |     job = run_job(job_params)
 60 |     check_status(job.id)
 61 |     path = export(job.id)
 62 |     while True:
 63 |         try:
 64 |             dir_name, file_name = download(path)
 65 |             return extract(dir_name, file_name)
 66 |         except OSError:
 67 |             print('error extract file, try again.')
 68 |             time.sleep(10)
 69 |             continue
 70 | 
 71 | 
 72 | def run_job(job_params):
 73 |     data = json.dumps(job_params).encode('utf8')
 74 | 
 75 |     req = urllib.request.Request(job_url, data=data, headers=headers)
 76 |     res = urllib.request.urlopen(req)
 77 |     job_data = Job(res.read().decode('utf8'))
 78 |     return job_data
 79 | 
 80 | 
 81 | def check_status(job_id):
 82 |     url = job_url + '/' + job_id + '?anchor=0&limit=1'
 83 | 
 84 |     while True:
 85 |         req = urllib.request.Request(url, headers=headers)
 86 |         res = urllib.request.urlopen(req)
 87 |         job_status = JobStatus(res.read().decode('utf8'))
 88 | 
 89 |         if job_status.status == 'RUNNING':
 90 |             print('RUNNING')
 91 |             time.sleep(3)
 92 |         elif job_status.status == 'OK':
 93 |             break
 94 | 
 95 | 
 96 | def export(job_id):
 97 |     url = 'https://leancloud.cn/1.1/bigquery/job/' + job_id + '/export'
 98 |     req = urllib.request.Request(url, data=''.encode('utf8'), headers=headers)
 99 |     res = urllib.request.urlopen(req)
100 |     job_export = JobExport(res.read().decode('utf8'))
101 | 
102 |     if job_export.status == 'OK':
103 |         return job_export.path
104 |     else:
105 |         print('Error, export failed.')
106 |         return None
107 | 
108 | 
109 | def download(url):
110 |     res = None
111 | 
112 |     # download file and keep loop if server returned 404 error
113 |     while True:
114 |         try:
115 |             res = urllib.request.urlopen(url)
116 |             break
117 |         except HTTPError:
118 |             print('download error, try again.')
119 |             time.sleep(3)
120 |             continue
121 | 
122 |     dir_name = url.split('/')[-2] + '/'
123 |     file_name = url.split('/')[-1]
124 | 
125 |     if not os.path.exists(cache_dir + dir_name):
126 |         os.makedirs(cache_dir + dir_name)
127 |     with open(cache_dir + dir_name + file_name, 'b+w') as f:
128 |         f.write(res.read())
129 |     return dir_name, file_name
130 | 
131 | 
132 | def extract(dir_name, file_name):
133 |     gz = gzip.open(cache_dir + dir_name + file_name, "rb")
134 |     json_file = cache_dir + dir_name + get_filename(file_name)
135 |     jf = open(json_file, 'wb')
136 |     jf.write(gz.read())
137 |     gz.close()
138 |     gz.close()
139 | 
140 |     if not os.path.exists(json_file):
141 |         return 'error'
142 |     else:
143 |         return json_file
144 | 
145 | 
146 | def get_filename(path):
147 |     filename = path.split('/')[-1].split('.')[0]
148 |     return filename
149 | 
150 | 
151 | class Job:
152 |     id = None
153 |     appId = None
154 | 
155 |     def __init__(self, s):
156 |         self.__dict__ = json.loads(s)
157 | 
158 |     def dump(self):
159 |         print(self.id, self.appId)
160 | 
161 | 
162 | class JobStatus:
163 |     id = None
164 |     status = None
165 | 
166 |     def __init__(self, s):
167 |         self.__dict__ = json.loads(s)
168 | 
169 |     def dump(self):
170 |         print(self.id, self.status)
171 | 
172 | 
173 | class JobExport:
174 |     status = None
175 |     path = None
176 | 
177 |     def __init__(self, s):
178 |         self.__dict__ = json.loads(s)
179 | 
180 |     def dump(self):
181 |         print(self.status, self.path)


--------------------------------------------------------------------------------