├── .gitignore
├── LICENSE
├── README.md
├── config
├── __init__.py
└── settings.py
├── db
├── __init__.py
├── data.sql
├── models.py
└── mysql_connector.py
├── qichacha.py
├── qichacha
├── __init__.py
├── client.py
├── crawler.py
└── manager.py
├── requirements.txt
├── tianyancha.py
├── tianyancha
├── __init__.py
├── client.py
├── crawler.py
└── tyc_rest_api.py
└── util
├── __init__.py
├── date.py
├── httpclient.py
├── log.py
└── wechat_auth.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .idea/*
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .coverage
43 | .coverage.*
44 | .cache
45 | nosetests.xml
46 | coverage.xml
47 | *.cover
48 | .hypothesis/
49 | .pytest_cache/
50 |
51 | # Translations
52 | *.mo
53 | *.pot
54 |
55 | logs
56 |
57 | *.log.*
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 |
63 | # Flask stuff:
64 | instance/
65 | .webassets-cache
66 |
67 | # Scrapy stuff:
68 | .scrapy
69 |
70 | # Sphinx documentation
71 | docs/_build/
72 |
73 | # PyBuilder
74 | target/
75 |
76 | # Jupyter Notebook
77 | .ipynb_checkpoints
78 |
79 | # pyenv
80 | .python-version
81 | .DS_Store
82 |
83 | # celery beat schedule file
84 | celerybeat-schedule
85 |
86 | # SageMath parsed files
87 | *.sage.py
88 |
89 | # Environments
90 | .env
91 | .venv
92 | venv/
93 | ENV/
94 | env.bak/
95 | venv.bak/
96 |
97 | # Spyder project settings
98 | .spyderproject
99 | .spyproject
100 |
101 | # Rope project settings
102 | .ropeproject
103 |
104 | # mkdocs documentation
105 | /site
106 |
107 | # mypy
108 | .mypy_cache/
109 | *.json
110 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 albertx
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 天眼查、企查查公司信息爬虫
2 | ===
3 |
4 |
5 | ## 使用说明
6 |
7 | 1. 设置用户状态
8 |
9 | 使用抓包工具抓取天眼查、企查查小程序的请求,将请求头中的用户鉴权信息填入各自目录下的 `__init__.py` 文件中。也可在此处配置随机 UA,项目地址:[fake_useragent](https://github.com/hellysmile/fake-useragent)
10 |
11 | 2. 设置数据源(`config/settings.py`)
12 | ```python
13 | MysqlConfig = {
14 | 'dev': {
15 | 'host': '192.168.1.103',
16 | 'port': 3306,
17 | 'db': 'enterprise',
18 | 'username': 'root',
19 | 'password': 'root@123'
20 | }
21 | }
22 | ```
23 | 3. 执行```db/data.sql```生成数据结构
24 | 4. 在 `config/settings.py` 中配置IP代理, 开启global proxy前请先自行部署ip代理池,项目地址:[proxy_pool](https://github.com/jhao104/proxy_pool.git)
25 | ```pydocstring
26 | # 全局代理控制,
27 | GLOBAL_PROXY = True
28 | PROXY_POOL_URL = "http://localhost:5010"
29 | ```
30 | 5. 在 `qichacha.py` 和 `tianyancha.py` 中设置爬取关键字
31 | ```pydocstring
32 | keys = ['Google'] # 设置爬取列表
33 | crawler.load_keys(keys)
34 | crawler.start()
35 | ```
36 |
37 |
38 | ## Schedule List
39 | |功能|日期|状态|备注|
40 | |---|---|---|---|
41 | |鉴权Token提取||待完成||
42 | |内置IP代理||待完成||
43 | |防封策略||待完成||
44 | |容器化运行||待完成||
45 |
46 |
47 |
48 |
49 | Please Kindly Note That
50 | ===
51 |
52 | 程序员技术交流tg群,欢迎大家加入!!!
53 |
54 | 内有技术交流!工作内推!远程工作!兼职、私活儿!!。
55 |
56 | Telegram群链接:[程序员社区https://t.me/+iZK2y8zMUiE0NDE1](https://t.me/+iZK2y8zMUiE0NDE1)
57 |
58 | 群二维码:
59 |
60 |
61 |
--------------------------------------------------------------------------------
/config/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # -*-: coding: utf-8 -*-
3 | """
4 | :author: lubosin
5 | :date: 03/28/2019
6 | """
7 | import logging as log
8 | import sys
9 |
10 | import config
11 | from config.settings import *
12 |
13 | sys.path.append("..")
14 |
15 |
class MysqlEnviron:
    """Read-only view of the MySQL settings for the active environment.

    ``CONFIG`` is resolved once, while the class body executes, by looking up
    ``ENV`` in ``MysqlConfig``. The settings are exposed as properties, so they
    must be read from an *instance* (``MysqlEnviron().host``); reading them
    from the class itself yields ``property`` objects.
    """

    CONFIG = MysqlConfig.get(ENV)
    if not CONFIG:
        # A missing or empty profile is a fatal misconfiguration: report it
        # and terminate with a non-zero status so callers can detect failure.
        log.error('no active environment')
        sys.exit(1)

    @property
    def host(self):
        """Database server host name or IP address."""
        return self.CONFIG.get('host')

    @property
    def port(self):
        """Database server TCP port."""
        return self.CONFIG.get('port')

    @property
    def database(self):
        """Schema name, stored under the ``db`` key of the profile."""
        return self.CONFIG.get('db')

    @property
    def username(self):
        """Login user name; ``None`` when the profile does not define one."""
        return self.CONFIG.get('username')

    @property
    def password(self):
        """Login password; ``None`` when the profile does not define one."""
        return self.CONFIG.get('password')
41 |
42 |
43 |
--------------------------------------------------------------------------------
/config/settings.py:
--------------------------------------------------------------------------------
# Active environment; must be a key of MysqlConfig: "dev", "test" or "prod".
ENV = "dev"

# Global proxy switch and the base URL of the proxy pool service.
GLOBAL_PROXY = True
PROXY_POOL_URL = "http://127.0.0.1:5010"

# MySQL connection settings, one profile per environment.
MysqlConfig = {
    'dev': {
        'host': '192.168.1.103',
        'port': 3306,
        'db': 'enterprise',
        # Without a user name the connector would be handed user=None.
        'username': 'root',
        'password': 'root@123'
    },
    'test': {
        'username': 'root',
    },
    # Still empty: config.MysqlEnviron treats an empty profile as
    # "no active environment" and aborts at import time.
    'prod': {
    }
}
24 |
25 |
26 |
27 |
28 |
--------------------------------------------------------------------------------
/db/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # -*-: coding: utf-8 -*-
3 | """
4 | :author: albert
5 | :date: 02/28/2019
6 | :desc:
7 | """
8 | import sys
9 | sys.path.append('..')
10 |
--------------------------------------------------------------------------------
/db/data.sql:
--------------------------------------------------------------------------------
# Company master table. Rows are keyed for upserts by the unique pair
# (credit_code, register_code).
DROP TABLE IF EXISTS `company`;
CREATE TABLE `company` (
  `id` int(11) unsigned not NULL AUTO_INCREMENT primary key COMMENT 'PK',
  `name` varchar(128) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '公司名',
  `representative` varchar(40) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '法人代表',
  `address` varchar(200) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '公司地址',
  `region` varchar(15) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '所属地区(省)',
  `city` varchar(15) character set utf8mb4 null default '-' COMMENT '城市',
  `district` varchar(15) character set utf8mb4 null default '-' COMMENT '区/县',
  `geoloc` varchar(80) character set utf8mb4 null default '-'
    comment '经纬度,json -> {"lat": "30.18484477830133", "long": "120.06383340659741"}',
  `biz_status` varchar(20) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '经营状态',
  `credit_code` varchar(32) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '统一社会信用代码',
  `register_code` varchar(32) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '注册号',
  `phone` varchar(20) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '电话',
  `email` varchar(50) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '邮箱',
  `setup_time` varchar(20) NULL DEFAULT '-' COMMENT '成立时间',
  `industry` varchar(64) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '所属行业',
  `biz_scope` varchar(1200) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '经营范围',
  `company_type` varchar(32) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '公司类型',
  `registered_capital` varchar(32) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '注册资本',
  `actual_capital` varchar(32) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '实缴资本',
  `taxpayer_code` varchar(32) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '纳税人识别号',
  `organization_code` varchar(32) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '组织机构代码',
  `english_name` varchar(128) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '公司英文名',
  `authorization` varchar(64) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '登记机关',
  `homepage` varchar(64) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '公司官网',
  `used_name` varchar(500) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '公司曾用名',
  `search_key` varchar(64) character set utf8mb4 null default '-' comment '搜索关键字',
  `create_at` timestamp not NULL DEFAULT CURRENT_TIMESTAMP COMMENT '插入时间',
  # The explicit NULL attribute is required: with explicit_defaults_for_timestamp
  # disabled, a TIMESTAMP column is implicitly NOT NULL and DEFAULT NULL is
  # rejected as an invalid default value.
  `modify_at` timestamp NULL DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP COMMENT '最后操作时间',
  # Placeholder for a composite index that is not defined yet:
  # index un_key() comment 'composite index',
  unique key uq_credit_reg_code(`credit_code`, `register_code`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '企业信息表';
35 |
# Shareholder records; unique per (credit_code, name).
DROP TABLE IF EXISTS `dim_shareholder`;
CREATE TABLE `dim_shareholder` (
  `id`            INTEGER      NOT NULL PRIMARY KEY AUTO_INCREMENT COMMENT 'pk',
  `credit_code`   VARCHAR(255) DEFAULT NULL COMMENT '企业社会信用代码',
  `name`          VARCHAR(255) DEFAULT NULL COMMENT '股东名称',
  `alias`         VARCHAR(255) DEFAULT NULL COMMENT '别称',
  `avatar`        VARCHAR(255) DEFAULT NULL COMMENT '股东头像',
  `control_ratio` VARCHAR(255) DEFAULT NULL COMMENT '股东控股比例',
  `tags`          JSON         DEFAULT NULL COMMENT '股东信息',
  CONSTRAINT UNIQUE INDEX unq_index (`credit_code`, `name`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '企业股东信息表';
48 |
# Company executive records; unique per (credit_code, name).
DROP TABLE IF EXISTS `dim_company_manager`;
CREATE TABLE `dim_company_manager` (
  `id`           INTEGER      NOT NULL PRIMARY KEY AUTO_INCREMENT COMMENT 'pk',
  `credit_code`  VARCHAR(255) DEFAULT NULL COMMENT '企业社会信用代码',
  `name`         VARCHAR(255) DEFAULT NULL COMMENT '企业高管名称',
  `titles`       JSON         DEFAULT NULL COMMENT '高管title',
  `manager_type` VARCHAR(255) DEFAULT NULL COMMENT '高管类型',
  CONSTRAINT UNIQUE INDEX unq_index (`credit_code`, `name`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '企业高管信息表';
58 |
# Province lookup table; the column defaults describe the nationwide entry.
DROP TABLE IF EXISTS `province`;
CREATE TABLE `province` (
  `id`     INTEGER UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT COMMENT 'pk',
  `simple` CHAR(3)     NULL DEFAULT 'CN'     COMMENT '省份拼音简写',
  `code`   VARCHAR(6)  NULL DEFAULT '000000' COMMENT '全国代码',
  `name`   VARCHAR(10) NULL DEFAULT '全国'   COMMENT '省份中文',
  INDEX idx_code (`code`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '地区省份表';
67 |
# City / district lookup table, indexed by (parent, code).
DROP TABLE IF EXISTS `city`;
CREATE TABLE `city` (
  `id`     INTEGER UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT COMMENT 'pk',
  `parent` VARCHAR(6)  NULL COMMENT '父级省',
  `code`   VARCHAR(6)  NULL COMMENT '市、区级代码',
  `name`   VARCHAR(10) NULL COMMENT '市、区级名',
  INDEX un_key (`parent`, `code`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4 COMMENT '市区级表';
76 |
77 | # drop table if exists `keyword`;
78 | # create table `keyword` (
79 | # `id` int(11) unsigned not null auto_increment primary key comment 'pk',
80 | # `name` varchar(40) character set utf8mb4 null comment '关键字',
81 | # `status` tinyint(1) unsigned null default 0 comment '状态, 0: 未爬取,1: 爬取中,2: 已爬取,3: 爬取失败, 4: 丢弃',
82 | # `insert_at` timestamp not null default current_timestamp() comment '添加时间'
83 | # ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '关键字表';
84 |
85 |
--------------------------------------------------------------------------------
/db/models.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | # @author bouxin
4 | # @since 2019-09-27
5 | # @description --
6 |
7 |
class Company(object):
    """Plain container for everything collected about one company.

    Every field starts as ``None`` except ``score`` (50.00), ``staffs`` (1)
    and the per-instance ``shareholders`` / ``managers`` lists.
    """

    def __init__(self):
        # Identity and naming.
        self.id = None
        self.name = None
        self.short_name = None
        self.representative = None
        self.found_time = None
        # Location.
        self.company_address = None
        self.register_address = None
        self.province = None
        self.city = None
        self.district = None
        self.biz_status = None
        # lat-long:: eg. {'latitude': '12.0023', 'longitude': '120.180'}
        self.geoloc = None
        # Contact details.
        self.emails = None
        self.phones = None
        self.contact = None
        # Business profile.
        self.biz_scope = None
        self.company_type = None
        self.score = 50.00
        self.register_capital = None
        self.websites = None
        # Registration identifiers.
        self.credit_code = None
        self.taxpayer_code = None
        self.register_code = None
        self.organization_code = None
        # Classification and search context.
        self.tags = None
        self.industry = None
        self.keyword = None
        self.logo = None
        self.company_desc = None
        self.financing_round = None
        self.competitions = None
        self.english_name = None
        self.register_institute = None
        self.actual_capital = None
        self.used_name = None
        self.staffs = 1
        self.tax_address = None
        self.taxpayer_bank = None
        self.portraits = None
        # Related entities; a fresh list per instance.
        self.shareholders = list()
        self.managers = list()

    def __str__(self) -> str:
        """Render all attributes as ``key: value`` pairs in assignment order."""
        rendered = []
        for pair in vars(self).items():
            rendered.append(': '.join(map(str, pair)))
        return ', '.join(rendered)
55 |
56 |
class CompanyShareholder(object):
    """Plain container for one shareholder of a company."""

    def __init__(self):
        self.name = None
        self.alias = None
        self.avatar = None
        self.control_ratio = None
        # A fresh list per instance.
        self.tags = list()

    def __str__(self) -> str:
        """Render all attributes as ``key: value`` pairs in assignment order."""
        rendered = []
        for pair in vars(self).items():
            rendered.append(': '.join(map(str, pair)))
        return ', '.join(rendered)
67 |
68 |
class CompanyManager(object):
    """Plain container for one executive of a company."""

    def __init__(self):
        self.name = None
        # A fresh list per instance.
        self.titles = list()
        self.manager_type = None

    def __str__(self) -> str:
        """Render all attributes as ``key: value`` pairs in assignment order."""
        rendered = []
        for pair in vars(self).items():
            rendered.append(': '.join(map(str, pair)))
        return ', '.join(rendered)
77 |
78 |
class Province(object):
    """Province entry; the defaults describe the nationwide pseudo-province."""

    def __init__(self):
        self.id = None
        # Kept as a string: the integer literal 000000 is just 0 and would
        # lose the six-digit form that the varchar(6) `code` column stores.
        self.code = '000000'
        self.name = '全国'
        self.simple = 'CN'

    def __str__(self) -> str:
        """Render all attributes as ``key: value`` pairs in assignment order."""
        return ', '.join('%s: %s' % elem for elem in self.__dict__.items())
88 |
89 |
class City(object):
    """City / district entry that belongs to a parent province code."""

    def __init__(self):
        self.id = None
        # Codes are kept as strings: the integer literal 000000 is just 0 and
        # would lose the six-digit form that the varchar(6) columns store.
        self.parent = '000000'
        self.code = '000000'
        self.name = '市区'

    def __str__(self) -> str:
        """Render all attributes as ``key: value`` pairs in assignment order."""
        return ', '.join('%s: %s' % elem for elem in self.__dict__.items())
99 |
100 |
--------------------------------------------------------------------------------
/db/mysql_connector.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # -*-: coding: utf-8 -*-
3 | """
4 | :author: albert
5 | :date: 03/07/2019
6 | """
7 |
8 |
9 | from dbutils.pooled_db import PooledDB
10 | from config import MysqlEnviron
11 | import logging as log
12 | import pymysql
13 |
14 |
# MysqlEnviron exposes its settings as properties, so they have to be read
# from an instance; reading them from the class would hand `property` objects
# to the driver instead of the configured values.
_mysql_env = MysqlEnviron()

# Shared connection pool (at most 20 connections) used by every writer below.
connection_pool = PooledDB(creator=pymysql,
                           maxconnections=20,
                           host=_mysql_env.host,
                           port=_mysql_env.port,
                           db=_mysql_env.database,
                           user=_mysql_env.username,
                           passwd=_mysql_env.password)
22 |
23 |
def insert_company(data: list):
    """Upsert companies together with their managers and shareholders.

    For each company the row is written first, then its managers, then its
    shareholders. An existing row (same unique key) is updated in place.

    :param data: company entities; each must expose ``managers`` and
        ``shareholders`` plus the attributes named by the SQL placeholders.
        Attributes that are absent are written as NULL.
    :return: None
    """
    sql = 'insert into `company`(`name`,`representative`,`address`,`region`,`city`,`district`,' \
          '`geoloc`,`biz_status`,`credit_code`,`register_code`,`phone`,`email`,`setup_time`,' \
          '`industry`, `biz_scope`,`company_type`,`registered_capital`,`actual_capital`,' \
          '`taxpayer_code`, `organization_code`,`english_name`,`authorization`,`homepage`,' \
          '`used_name`,`create_at`, `modify_at`, `search_key`) ' \
          'values(%(name)s,%(representative)s,%(address)s,%(region)s,%(city)s,%(district)s,' \
          '%(geoloc)s,%(biz_status)s,%(credit_code)s,%(register_code)s,%(phone)s,%(email)s,' \
          '%(setup_time)s, %(industry)s,%(biz_scope)s,%(company_type)s,%(registered_capital)s,' \
          '%(actual_capital)s, %(taxpayer_code)s,%(organization_code)s,%(english_name)s,' \
          '%(authorization)s,%(homepage)s, %(used_name)s,now(),now(), %(keyword)s) ' \
          'on duplicate key update `name`=%(name)s,`representative`=%(representative)s,' \
          '`address`=%(address)s,`region`=%(region)s,`geoloc`=%(geoloc)s,' \
          '`biz_status`=%(biz_status)s,`credit_code`=%(credit_code)s,' \
          '`register_code`=%(register_code)s,`phone`=%(phone)s,`email`=%(email)s,' \
          '`setup_time`=%(setup_time)s,`industry`=%(industry)s,`biz_scope`=%(biz_scope)s,' \
          '`company_type`=%(company_type)s,`registered_capital`=%(registered_capital)s,' \
          '`actual_capital`=%(actual_capital)s,`taxpayer_code`=%(taxpayer_code)s,' \
          '`organization_code`=%(organization_code)s,`english_name`=%(english_name)s,' \
          '`authorization`=%(authorization)s,`homepage`=%(homepage)s,`used_name`=%(used_name)s,' \
          '`modify_at`=now()'
    # Every named placeholder used by the statement above.
    fields = ('name', 'representative', 'address', 'region', 'city', 'district',
              'geoloc', 'biz_status', 'credit_code', 'register_code', 'phone', 'email',
              'setup_time', 'industry', 'biz_scope', 'company_type', 'registered_capital',
              'actual_capital', 'taxpayer_code', 'organization_code', 'english_name',
              'authorization', 'homepage', 'used_name', 'keyword')
    for company in data or []:
        managers = company.managers
        shareholders = company.shareholders
        # Named placeholders need a mapping, not the entity object itself.
        # Only the placeholder fields are passed so the nested manager /
        # shareholder lists never reach the driver's value escaping.
        params = {field: getattr(company, field, None) for field in fields}
        write(sql, params)
        insert_company_manager(managers)
        insert_company_shareholder(shareholders)
56 |
57 |
def insert_company_shareholder(data: list):
    """Upsert every shareholder in ``data`` into ``dim_shareholder``.

    :param data: shareholder entities (objects or dicts) providing the
        attributes / keys named by the SQL placeholders.
    :return: total of the row counts reported by the individual statements;
        0 when ``data`` is empty.
    """
    import json

    sql = 'insert into `dim_shareholder`(`credit_code`, `name`, `alias`, `avatar`, `control_ratio`, `tags`) ' \
          'values (%(credit_code)s, %(name)s, %(alias)s, %(avatar)s, %(control_ratio)s, %(tags)s) ' \
          'on duplicate key update `name`=%(name)s, `alias`=%(alias)s, `avatar`=%(avatar)s, ' \
          '`control_ratio`=%(control_ratio)s, `tags`=%(tags)s'
    fields = ('credit_code', 'name', 'alias', 'avatar', 'control_ratio', 'tags')
    affected = 0
    for shareholder in data or []:
        source = shareholder if isinstance(shareholder, dict) else vars(shareholder)
        # Named placeholders need a mapping; missing fields become NULL.
        # NOTE(review): CompanyShareholder defines no `credit_code`, so unless
        # the producer sets one the row is stored with a NULL credit code.
        params = {field: source.get(field) for field in fields}
        tags = params['tags']
        if tags is not None and not isinstance(tags, str):
            # `tags` is a JSON column, so containers are serialised explicitly.
            # assumes the tag values are JSON-serialisable - TODO confirm
            params['tags'] = json.dumps(tags, ensure_ascii=False)
        affected += write(sql, params) or 0
    return affected
65 |
66 |
def insert_company_manager(data: list):
    """Upsert every manager in ``data`` into ``dim_company_manager``.

    :param data: manager entities (objects or dicts) providing the
        attributes / keys named by the SQL placeholders.
    :return: total of the row counts reported by the individual statements;
        0 when ``data`` is empty.
    """
    import json

    sql = 'insert into `dim_company_manager`(`credit_code`, `name`, `titles`, `manager_type`) ' \
          'values (%(credit_code)s, %(name)s, %(titles)s, %(manager_type)s) ' \
          'on duplicate key update `name`=%(name)s, `titles`=%(titles)s, `manager_type`=%(manager_type)s'
    fields = ('credit_code', 'name', 'titles', 'manager_type')
    affected = 0
    for manager in data or []:
        source = manager if isinstance(manager, dict) else vars(manager)
        # Named placeholders need a mapping; missing fields become NULL.
        # NOTE(review): CompanyManager defines no `credit_code`, so unless the
        # producer sets one the row is stored with a NULL credit code.
        params = {field: source.get(field) for field in fields}
        titles = params['titles']
        if titles is not None and not isinstance(titles, str):
            # `titles` is a JSON column, so containers are serialised explicitly.
            # assumes the title values are JSON-serialisable - TODO confirm
            params['titles'] = json.dumps(titles, ensure_ascii=False)
        affected += write(sql, params) or 0
    return affected
73 |
74 |
def write(sql: str, data: any):
    """Execute one write statement on a pooled connection and commit it.

    :param sql: statement with driver-style placeholders.
    :param data: parameters for the placeholders (mapping or sequence).
    :return: whatever ``cursor.execute`` returns for the statement.
    :raises Exception: any driver error is re-raised unchanged after the
        transaction has been rolled back and the failure logged.
    """
    connection = connection_pool.connection()
    try:
        cursor = connection.cursor()
        try:
            # execute() sits inside the guarded region so that a failing
            # statement is rolled back as well, not only a failing commit.
            result = cursor.execute(sql, data)
            connection.commit()
        finally:
            cursor.close()
    except Exception:
        connection.rollback()
        log.exception('Insertion Error!')
        raise
    finally:
        # Closing a pooled connection hands it back to the pool.
        connection.close()

    return result
88 |
89 |
--------------------------------------------------------------------------------
/qichacha.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # -*-: coding: utf-8 -*-
3 | """
4 | :author: lubosson
5 | :date: 2019-04-16
6 | :desc:
7 | """
8 | from qichacha import crawler as QccCrawler
9 | from util import log
10 | import urllib3
11 | urllib3.disable_warnings()
12 |
13 |
# Point this crawler's logging at its own file.
# NOTE(review): util.log is not visible here - confirm what set_file() configures.
log.set_file("./logs/qichacha.log")
# The crawler module itself serves as the application object.
app = QccCrawler

if __name__ == '__main__':
    # Search keywords to crawl.
    keys = ['Google中国']
    app.load_keys(keys)
    app.start()
21 |
22 |
--------------------------------------------------------------------------------
/qichacha/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # -*-: coding: utf-8 -*-
3 | """
4 | :author: lubosson
5 | :date: 2019-04-15
6 | :desc:
7 | """
8 | import sys
9 | sys.path.append('..')
10 |
# Keyword search endpoint.
SEARCH_API = "https://xcx.qichacha.com/wxa/v1/base/advancedSearchNew"
# Company detail endpoint.
COMPANY_DETAIL_API = "https://xcx.qichacha.com/wxa/v1/base/getEntDetail"
# Area code list endpoint.
AREA_API = "https://xcx.qichacha.com/wxa/v1/admin/getAreaList"
17 | """ web浏览器no-login COOKIE """
18 | COOKIE = "zg_did=%7B%22did%22%3A%20%22168dbc0b22f6e5-0d361e70cfef92-10306653-13c680-168dbc0b23013bd%22%7D; _uab_collina=154987506595105102560196; acw_tc=78c7474915498750659746725e47bcf5da5e01750eaa818d83d5019d1f; saveFpTip=true; UM_distinctid=168e101305e193-0665042ea0cf1-133b6850-13c680-168e101305f37d; CNZZDATA1254842228=1871928231-1549959491-https%253A%252F%252Fwww.qichacha.com%252F%7C1549959491; QCCSESSID=780j6eils4m98fspmr9cvtc9p5; hasShow=1; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201551756182960%2C%22updated%22%3A%201551756803803%2C%22info%22%3A%201551242110203%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22cuid%22%3A%20%22fc6fca91d248e7cf976bd652db7e11c6%22%7D"
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36"
# Disguised request headers; capture the qcc mini-program traffic for more
# parameters.
REQUEST_HEADERS = {"User-Agent": USER_AGENT, "Cookie": COOKIE}
# Token returned when the Qichacha mini-program is authorised. It expires
# after one hour and has to be refreshed by hand; it can also be obtained by
# simulating the app login through a proxy.
TOKEN = "9a62aaad7cda6c73a35d598f93e8d169"
30 |
31 |
--------------------------------------------------------------------------------
/qichacha/client.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | # @author lubosson
4 | # @since 2019-09-27
5 | # @description --
6 | import json
7 | from time import sleep
8 | from qichacha import *
9 | from util.httpclient import Request
10 |
11 |
class QichachaClient:
    """Thin client for the Qichacha mini-program search and detail APIs."""

    @staticmethod
    def search(keyword: str) -> list:
        """Return the first page of companies matching ``keyword``.

        :param keyword: search term; a falsy value yields an empty list
            without any request being made.
        :return: flat list of raw company dicts, empty when the request
            fails or the API status is not 200.
        """
        results = []
        if not keyword:
            return results

        payload = {
            "searchKey": keyword,
            "token": TOKEN,
            "pageIndex": 1,  # only the first page (20 rows) is fetched per keyword
            "searchType": 0,
            "isSortAsc": False
        }
        data = Request(SEARCH_API, params=payload, headers=REQUEST_HEADERS).data
        sleep(2)  # throttle between consecutive API calls
        if not data:
            return results

        data = json.loads(data)
        if data.get('status') == 200:
            # extend, not append: callers iterate over the companies
            # themselves, not over a list wrapped in another list.
            results.extend((data.get('result') or {}).get('Result') or [])
        return results

    @staticmethod
    def search_detail(key_no):
        """Return the detail record of the company identified by ``key_no``.

        :param key_no: company key taken from a search hit; a falsy value
            yields an empty dict without any request being made.
        :return: raw company detail dict, empty when the request fails or
            the API status is not 200.
        """
        detail = dict()
        if not key_no:
            return detail

        payload = {
            "token": TOKEN,
            "unique": key_no
        }
        data = Request(url=COMPANY_DETAIL_API, params=payload, headers=REQUEST_HEADERS).data
        sleep(2)  # throttle between consecutive API calls
        if not data:
            return detail

        data = json.loads(data)
        if data.get('status') == 200:
            # `data` is already the decoded dict; it has no .json() method.
            detail = (data.get('result') or {}).get('Company') or {}
        return detail
48 |
49 |
50 |
--------------------------------------------------------------------------------
/qichacha/crawler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # -*-: coding: utf-8 -*-
3 | """
4 | :author: lubosson
5 | :date: 2019-04-15
6 | :desc:
7 | """
8 | import logging as log
9 | from qichacha.client import QichachaClient
10 | from qichacha.manager import QichachaManager
11 | from db.models import Company
12 |
# Shared Qichacha API client used by start().
qcc_client = QichachaClient()
# Shared mapper that copies raw API fields onto Company entities.
manager = QichachaManager()
16 |
17 |
def start():
    """Crawl every loaded keyword and log the assembled companies.

    Keywords are read from the module global ``keywords`` set by
    ``load_keys``; when none were loaded the function only logs completion.
    NOTE(review): the assembled entities are logged but not persisted here.
    """
    keywords = globals().get('keywords') or []
    for keyword in keywords:
        raw_companies = qcc_client.search(keyword)
        log.info('正在处理爬取[%s]' % keyword)
        for raw_company in raw_companies:
            # One fresh entity per search hit, so no state is shared
            # between different companies.
            company = Company()
            company.keyword = keyword
            # Fill in the fields available from the search result.
            manager.assembly(company, raw_company)
            raw_company_detail = qcc_client.search_detail(raw_company.get('KeyNo'))
            # Complete the entity from the detail record.
            manager.assembly_detail(company, raw_company_detail)
            log.info(company)
    log.info('completed')
34 |
35 |
def load_keys(keys: list):
    """Set the keywords that ``start`` will crawl.

    Each call replaces the previously loaded list; ``setdefault`` would
    silently ignore every call after the first one.

    :param keys: search keywords.
    """
    globals()['keywords'] = keys
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
--------------------------------------------------------------------------------
/qichacha/manager.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | # @author lubosson
4 | # @since 2019-09-27
5 | # @description --
6 | from db.models import Company
7 |
8 |
class QichachaManager(object):
    """Copies raw Qichacha API fields onto a company entity."""

    @classmethod
    def assembly(cls, company: 'Company', raw_company: dict):
        """Fill ``company`` from one search-result record.

        Missing keys fall back to ``'-'``.

        :param company: entity to populate (mutated in place).
        :param raw_company: raw company dict from the search API.
        """
        import json

        company.name = raw_company.get('Name', '-')
        company.representative = raw_company.get('OperName', '-')
        company.address = raw_company.get('Address', '-')
        company.region = raw_company.get('AreaCode', '-')  # todo
        company.city = raw_company.get('AreaCode', '-')  # todo
        company.district = raw_company.get('AreaCode', '-')  # todo
        company.biz_status = raw_company.get('Status', '-')
        company.credit_code = raw_company.get('CreditCode', '-')
        company.email = raw_company.get('Email', '-')
        company.phone = raw_company.get('ContactNumber', '-')
        company.biz_scope = raw_company.get('Scope', '-')
        company.company_type = raw_company.get('EconKind', '-')
        company.taxpayer_code = raw_company.get('CreditCode', '-')
        company.registered_capital = raw_company.get('RegistCapi', '-')
        coordinates = {
            'lat': raw_company.get('X', '-'),
            'long': raw_company.get('Y', '-')
        }
        # Kept unchanged for existing readers of `lat_long`.
        company.lat_long = str(coordinates)
        # `geoloc` is the attribute the Company model and the insert
        # statement use; the `geoloc` column is documented as holding JSON.
        company.geoloc = json.dumps(coordinates, ensure_ascii=False)
        company.setup_time = raw_company.get('StartDate', '-')

    @classmethod
    def assembly_detail(cls, company: 'Company', raw_company_detail: dict):
        """Complete ``company`` from its detail record.

        :param company: entity to populate (mutated in place).
        :param raw_company_detail: raw detail dict; may be empty or ``None``.
        """
        detail = raw_company_detail or {}
        # A null WebSite / Industry value must not crash the slice / lookup.
        company.homepage = (detail.get('WebSite') or '-')[0:30]
        company.register_code = detail.get('No', '-')
        company.organization_code = detail.get('OrgNo', '-')
        company.english_name = detail.get('EnglishName', '-')
        company.authorization = detail.get('BelongOrg', '-')
        company.actual_capital = detail.get('RealCapi', '缺省')
        company.industry = (detail.get('Industry') or dict()).get('Industry', '-')
        company.used_name = detail.get('OriginalName', '-')
42 |
43 |
44 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | certifi
2 | chardet
3 | DBUtils
4 | idna
5 | PyMySQL
6 | redis
7 | requests
8 | urllib3
9 | uplink
--------------------------------------------------------------------------------
/tianyancha.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # -*-: coding: utf-8 -*-
3 | """
4 | :author: lubosin
5 | :date: 03/28/2019
6 | """
7 | from tianyancha import crawler
8 | from util import log
9 | import urllib3
10 | urllib3.disable_warnings()
11 |
12 |
# Point this crawler's logging at its own file.
# NOTE(review): util.log is not visible here - confirm what set_file() configures.
log.set_file("./logs/tianyancha.log")


if __name__ == '__main__':
    # Search keywords to crawl.
    keys = ['谷歌']
    crawler.load_keys(keys)
    crawler.start()
20 |
21 |
22 |
23 |
24 |
--------------------------------------------------------------------------------
/tianyancha/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # -*-: coding: utf-8 -*-
3 | """
4 | :author: albert
5 | :date: 03/08/2019
6 | """
7 | import sys
8 | sys.path.append('..')
9 |
# Keyword search endpoint; {q} takes the URL-quoted keyword.
TycQueryApi = "https://api9.tianyancha.com/services/v3/search/sNorV3/{q}"
# Company base-info endpoint; {eid} takes the company id.
TycPortraitApi = "https://api9.tianyancha.com/services/v3/t/common/baseinfoV5/{eid}"
# Shareholder and manager data are both posted to the same endpoint.
TycShareholderPostApi = "https://capi.tianyancha.com/cloud-facade/company/familyBucket"
TycEnterpriseManagerPostApi = "https://capi.tianyancha.com/cloud-facade/company/familyBucket"
# Company risk endpoint.
TycEnterpriseRiskApi = "https://api9.tianyancha.com/services/v3/risk/companyRiskInfoV4?{id}"

# Value of the Authorization request header.
AUTHORIZATION = '0###oo34J0VKzLlpdvf8kgFkMlfU_IPY###1642087379312###22494f3155c2e5a4be76e503837fa439'
18 | """ 请求token """
19 | X_AUTH_TOKEN = "eyJkaXN0aW5jdF9pZCI6IjE3ZDFjNWVhMzZjNGY2LTA5ZjU2NWUwNWViNTZjLTFjMzA2ODUxLTIwNzM2MDAtMTdkMWM1ZWEzNmRiMzYiLCJsaWIiOnsiJGxpYiI6ImpzIiwiJGxpYl9tZXRob2QiOiJjb2RlIiwiJGxpYl92ZXJzaW9uIjoiMS4xNS4yNCJ9LCJwcm9wZXJ0aWVzIjp7IiR0aW1lem9uZV9vZmZzZXQiOi00ODAsIiRzY3JlZW5faGVpZ2h0IjoxMDgwLCIkc2NyZWVuX3dpZHRoIjoxOTIwLCIkbGliIjoianMiLCIkbGliX3ZlcnNpb24iOiIxLjE1LjI0IiwiJGxhdGVzdF90cmFmZmljX3NvdXJjZV90eXBlIjoi6Ieq54S25pCc57Si5rWB6YePIiwiJGxhdGVzdF9zZWFyY2hfa2V5d29yZCI6IuacquWPluWIsOWAvCIsIiRsYXRlc3RfcmVmZXJyZXIiOiJodHRwczovL3d3dy5nb29nbGUuY29tLyIsImN1cnJlbnRfdXJsIjoiaHR0cHM6Ly93d3cudGlhbnlhbmNoYS5jb20vc2VhcmNoP2tleT0lRTYlOUQlQUQlRTUlQjclOUUlRTYlOTklQUUlRTUlODUlQjQlRTQlQkMlODElRTQlQjglOUElRTclQUUlQTElRTclOTAlODYlRTUlOTAlODglRTQlQkMlOTklRTQlQkMlODElRTQlQjglOUEiLCJyZWZlcnJlciI6Imh0dHBzOi8vd3d3LnRpYW55YW5jaGEuY29tL3NlYXJjaD9rZXk9JUU2JTlEJUFEJUU1JUI3JTlFJUU2JTk5JUFFJUU1JTg1JUI0JUU0JUJDJTgxJUU0JUI4JTlBJUU3JUFFJUExJUU3JTkwJTg2JUU1JTkwJTg4JUU0JUJDJTk5JUU0JUJDJTgxJUU0JUI4JTlBIiwidHljaWQiOiI0MmMxZTY1MDQ0ZjYxMWVjYmIxZDY3ZmJiYzEwN2U3NSIsIm5hbWUiOiLmna3lt57mma7lhbTkvIHkuJrnrqHnkIblkIjkvJnkvIHkuJoiLCJtb2R1bGUiOiLkvJjotKjlrp7lkI3orqTor4EiLCIkaXNfZmlyc3RfZGF5IjpmYWxzZX0sImFub255bW91c19pZCI6IjE3ZDFjNWVhMzZjNGY2LTA5ZjU2NWUwNWViNTZjLTFjMzA2ODUxLTIwNzM2MDAtMTdkMWM1ZWEzNmRiMzYiLCJ0eXBlIjoidHJhY2siLCJldmVudCI6InNlYXJjaF9yZXN1bHRfZXhwdXJlIiwiX3RyYWNrX2lkIjo3MjUyNDM3Mjd9"
# Headers sent with every Tianyancha request.
REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36",
    "version": "TYC-XCX-WX",
    "Host": "api9.tianyancha.com",
    "Authorization": AUTHORIZATION,
    "x-auth-token": X_AUTH_TOKEN,
}
28 |
29 |
30 |
31 |
--------------------------------------------------------------------------------
/tianyancha/client.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | # @author sanfeng
4 | # @since 2019-09-27
5 | # @description --
6 | import json
7 | import logging
8 |
9 | from db.models import Company, CompanyShareholder, CompanyManager
10 | from tianyancha import *
11 | from urllib.parse import quote
12 | from util.httpclient import Request
13 |
14 |
15 | class TycClient:
16 | def __init__(self, payload=None):
17 | self.payload = payload
18 | self.keyword = None
19 | self.src = []
20 | self.brand_and_agencies = []
21 | self.companies = []
22 |
def search(self, keyword: str):
    """Search companies by keyword and assemble the hits into entities.

    :param keyword: search keyword.
    :return: this client, so that calls can be chained; the assembled
        entities are appended to ``self.companies``.
    """
    self.keyword = keyword
    if not self.payload:
        # Default paging: first page, 20 rows.
        self.payload = {
            "pageNum": 1,
            "pageSize": 20,
            "sortType": 0
        }
    url = TycQueryApi.format(q=quote(keyword))
    data = Request(url, self.payload, headers=REQUEST_HEADERS).data
    if not data:
        return self

    api_data = json.loads(data)
    if api_data.get("state") != 'ok':
        logging.info("查询异常:[%s]" % api_data)
        return self

    # "data" and the lists below may be present but null in the response.
    result = api_data.get("data") or {}
    self.src = result.get("companyList") or []
    self.brand_and_agencies = result.get("brandAndAgencyList") or []
    self.__post_process__()
    return self
47 |
48 | def __post_process__(self):
49 | if not self.src:
50 | return
51 |
52 | company_list = self.src
53 | for company in company_list:
54 | company_entity = Company()
55 | # 公司检索的关键字
56 | company_entity.keyword = self.keyword
57 | # 公司主体基本信息
58 | self.EntityHelper.__basic_info__(company, company_entity)
59 |
60 | def is_equal(b_and_a):
61 | return company.get('id') == b_and_a.get('graphId')
62 |
63 | try:
64 | # 公司主体融资阶段、竟品信息
65 | brand_and_agency = filter(is_equal, self.brand_and_agencies).__next__()
66 | self.EntityHelper.__another_info__(brand_and_agency, company_entity)
67 | except:
68 | logging.warning('竟品信息获取失败!')
69 | pass
70 | """ 公司详情 """
71 | detail_resp = Request(TycPortraitApi.format(eid=company.get("id")), headers=REQUEST_HEADERS).data
72 | if detail_resp:
73 | company_portrait = json.loads(detail_resp)
74 | # 公司详情补充信息
75 | if company_portrait.get("state") == 'ok':
76 | self.EntityHelper.__additional__(company_portrait.get("data", {}), company_entity)
77 |
78 | shareholder_request_body = {
79 | "graphId": company.get("id"),
80 | "hkVersion": 1,
81 | "typeList": {
82 | "shareHolder": {
83 | "pageNum": 1,
84 | "pageSize": 20,
85 | "required": "true"
86 | }
87 | }
88 | }
89 | """ 股东信息 """
90 | shareholder_resp = Request(TycShareholderPostApi, method='post', json=shareholder_request_body, headers=REQUEST_HEADERS).data
91 | if shareholder_resp:
92 | company_shareholder = json.loads(shareholder_resp)
93 | # 公司详情补充信息
94 | if company_shareholder.get("state") == 'ok':
95 | self.EntityHelper.__shareholder__(company_shareholder.get("data", {}).get("shareHolder", {}), company_entity)
96 |
97 | manager_request_body = {
98 | "graphId": company.get("id"),
99 | "hkVersion": 1,
100 | "typeList": {
101 | "companyStaff": {
102 | "pageNum": 1,
103 | "pageSize": 20,
104 | "required": "true"
105 | }
106 | }
107 | }
108 | """ 高管信息 """
109 | manager_resp = Request(TycEnterpriseManagerPostApi, method='post', json=manager_request_body, headers=REQUEST_HEADERS).data
110 | if manager_resp:
111 | company_manager = json.loads(manager_resp)
112 | # 公司详情补充信息
113 | if company_manager.get("state") == 'ok':
114 | self.EntityHelper.__company_manager__(company_manager.get("data", {}).get("companyStaff", {}), company_entity)
115 | self.companies.append(company_entity)
116 |
117 | class EntityHelper:
118 | @staticmethod
119 | def __basic_info__(src: dict, target: Company):
120 | # 公司外部系统ID
121 | target.id = src.get('id', '-')
122 | # 公司名称
123 | target.name = src.get('name', '-').replace('', '').replace('', '')
124 | # 公司简称
125 | target.short_name = src.get('alias', '-')
126 | # 公司法人
127 | target.representative = src.get('legalPersonName', '-')
128 | # 公司成立时间
129 | target.found_time = src.get('estiblishTime', '-')[0:10]
130 | # 公司地址
131 | target.company_address = src.get('regLocation', '-')
132 | # 公司注册地址
133 | target.register_address = src.get('regLocation', '-')
134 | # 公司所在省份,例:浙江,北京,广东
135 | target.province = src.get('base', '-')
136 | # 公司所在市
137 | target.city = src.get('city', '-')
138 | # 公司所在区
139 | target.district = src.get('district', '-')
140 | # 公司经营状态
141 | target.biz_status = src.get('regStatus', '-')
142 | # 公司地址经纬度坐标
143 | target.geoloc = str({
144 | 'latitude': src.get('latitude', '-'),
145 | 'longitude': src.get('longitude', '-')
146 | })
147 | # 公司邮箱列表
148 | target.emails = src.get('emails', ['-']).split(';')[0].replace('\t', '')
149 | # 公司联系方式列表
150 | target.phones = src.get('phoneList', [])
151 | # 公司联系方式
152 | target.contact = src.get('phoneNum', '-')
153 | # 公司经营范围
154 | target.biz_scope = src.get('businessScope', '-')
155 | # 公司类型
156 | target.company_type = src.get('companyOrgType', '-').replace('\t', '')
157 | # 公司质量分数
158 | target.score = src.get('orginalScore', 0)
159 | # 公司注册资本
160 | target.register_capital = src.get('regCapital', '-')
161 | # 公司统一社会信用代码
162 | target.credit_code = src.get('creditCode', '-')
163 | # 公司纳税号
164 | target.taxpayer_code = src.get('taxCode')
165 | if not target.taxpayer_code:
166 | target.taxpayer_code = target.credit_code
167 | # 公司注册号
168 | target.register_code = src.get('regNumber', '-')
169 | # 公司组织机构代码
170 | target.organization_code = src.get('orgNumber', '-')
171 | # 公司标签列表
172 | target.tags = src.get('labelListV2', [])
173 | # 公司行业分类
174 | target.industry = src.get('categoryStr', '-')
175 |
176 | @staticmethod
177 | def __another_info__(brand_and_agency: dict, company: Company):
178 | # 公司融资轮次
179 | company.financing_round = brand_and_agency.get("round", "未知")
180 | # 公司竟品信息
181 | company.competitions = brand_and_agency.get("jingpinName", [])
182 | # 公司logo
183 | company.logo = brand_and_agency.get("logo")
184 | # 公司简介
185 | company.company_desc = brand_and_agency.get("intro")
186 |
187 | @staticmethod
188 | def __additional__(src: dict, company: Company):
189 | # 公司英文名
190 | company.english_name = src.get('property3')
191 | if not company.english_name:
192 | company.english_name = src.get('nameEn', '-')
193 | # 公司注册机构
194 | company.register_institute = src.get('regInstitute', '-')
195 | # 公司网站地址集
196 | company.websites = src.get('websiteList', '-')
197 | # 公司实缴资本
198 | company.actual_capital = src.get('actualCapital', '缺省')
199 | # 公司曾用名
200 | company.used_name = src.get('historyNames', '-')
201 | # 公司员工人数
202 | company.staffs = src.get('socialStaffNum', None)
203 | if not company.staffs:
204 | company.staffs = src.get('staffNum', 1)
205 | # 公司纳税地址
206 | company.tax_address = src.get('taxAddress', None)
207 | if not company.tax_address:
208 | company.tax_address = src.get('regLocation', '-')
209 | # 公司纳税银行
210 | company.taxpayer_bank = src.get('taxBankName', '-')
211 | # 公司涉足领域标签
212 | company.portraits = src.get('portray', [])
213 | if not company.logo:
214 | company.logo = src.get('logo')
215 | if not company.company_desc:
216 | company.company_desc = src.get('baseInfo', '-')
217 |
218 | @staticmethod
219 | def __shareholder__(src: dict, company: Company):
220 | holder_list = src.get("holderList", [])
221 | for holder in holder_list:
222 | if holder:
223 | shareholder = CompanyShareholder()
224 | shareholder.name = holder.get("name")
225 | shareholder.alias = holder.get("alias")
226 | shareholder.avatar = holder.get("logo")
227 | shareholder.control_ratio = holder.get("proportion")
228 | shareholder.tags = [tag.get("name") for tag in holder.get("tagList", [])]
229 | company.shareholders.append(shareholder)
230 |
231 | @staticmethod
232 | def __company_manager__(src: dict, company: Company):
233 | manager_list = src.get("result", [])
234 | manager_type = src.get("staffTitle", "-")
235 | for manager in manager_list:
236 | company_manager = CompanyManager()
237 | company_manager.manager_type = manager_type
238 | company_manager.name = manager.get("name", "-")
239 | company_manager.titles = manager.get("typeJoin", [])
240 | company.managers.append(company_manager)
241 |
242 |
--------------------------------------------------------------------------------
/tianyancha/crawler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # -*-: coding: utf-8 -*-
3 | """
4 | :author: albert
5 | :date: 03/08/2019
6 | """
7 | import logging
8 | from tianyancha.client import TycClient
9 | from db.mysql_connector import *
10 |
11 |
def start():
    """Entry point: crawl every keyword registered in this module's globals
    under ``keywords`` and log each company that was found."""
    for keyword in globals().get('keywords', []):
        logging.info('正在采集[%s]...' % keyword)
        client = TycClient()
        found = client.search(keyword).companies
        # Persisting to the database (insert_company) is currently disabled;
        # results are only written to the log.
        for company in found:
            logging.info(company.__str__())
    logging.info("completed")
26 |
27 |
def load_keys(keys: list):
    """Register *keys* as this module's ``keywords`` global.

    Only the first registration takes effect: when ``keywords`` already
    exists it is left untouched.
    """
    module_scope = globals()
    if 'keywords' not in module_scope:
        module_scope['keywords'] = keys
30 |
31 |
32 |
33 |
34 |
35 |
--------------------------------------------------------------------------------
/tianyancha/tyc_rest_api.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | import logging
5 |
6 | from uplink import *
7 |
8 | AUTHORIZATION = '0###oo34J0VKzLlpdvf8kgFkMlfU_IPY###1642087379312###22494f3155c2e5a4be76e503837fa439'
9 | """ 请求token """
10 | X_AUTH_TOKEN = "eyJkaXN0aW5jdF9pZCI6IjE3ZDFjNWVhMzZjNGY2LTA5ZjU2NWUwNWViNTZjLTFjMzA2ODUxLTIwNzM2MDAtMTdkMWM1ZWEzNmRiMzYiLCJsaWIiOnsiJGxpYiI6ImpzIiwiJGxpYl9tZXRob2QiOiJjb2RlIiwiJGxpYl92ZXJzaW9uIjoiMS4xNS4yNCJ9LCJwcm9wZXJ0aWVzIjp7IiR0aW1lem9uZV9vZmZzZXQiOi00ODAsIiRzY3JlZW5faGVpZ2h0IjoxMDgwLCIkc2NyZWVuX3dpZHRoIjoxOTIwLCIkbGliIjoianMiLCIkbGliX3ZlcnNpb24iOiIxLjE1LjI0IiwiJGxhdGVzdF90cmFmZmljX3NvdXJjZV90eXBlIjoi6Ieq54S25pCc57Si5rWB6YePIiwiJGxhdGVzdF9zZWFyY2hfa2V5d29yZCI6IuacquWPluWIsOWAvCIsIiRsYXRlc3RfcmVmZXJyZXIiOiJodHRwczovL3d3dy5nb29nbGUuY29tLyIsImN1cnJlbnRfdXJsIjoiaHR0cHM6Ly93d3cudGlhbnlhbmNoYS5jb20vc2VhcmNoP2tleT0lRTYlOUQlQUQlRTUlQjclOUUlRTYlOTklQUUlRTUlODUlQjQlRTQlQkMlODElRTQlQjglOUElRTclQUUlQTElRTclOTAlODYlRTUlOTAlODglRTQlQkMlOTklRTQlQkMlODElRTQlQjglOUEiLCJyZWZlcnJlciI6Imh0dHBzOi8vd3d3LnRpYW55YW5jaGEuY29tL3NlYXJjaD9rZXk9JUU2JTlEJUFEJUU1JUI3JTlFJUU2JTk5JUFFJUU1JTg1JUI0JUU0JUJDJTgxJUU0JUI4JTlBJUU3JUFFJUExJUU3JTkwJTg2JUU1JTkwJTg4JUU0JUJDJTk5JUU0JUJDJTgxJUU0JUI4JTlBIiwidHljaWQiOiI0MmMxZTY1MDQ0ZjYxMWVjYmIxZDY3ZmJiYzEwN2U3NSIsIm5hbWUiOiLmna3lt57mma7lhbTkvIHkuJrnrqHnkIblkIjkvJnkvIHkuJoiLCJtb2R1bGUiOiLkvJjotKjlrp7lkI3orqTor4EiLCIkaXNfZmlyc3RfZGF5IjpmYWxzZX0sImFub255bW91c19pZCI6IjE3ZDFjNWVhMzZjNGY2LTA5ZjU2NWUwNWViNTZjLTFjMzA2ODUxLTIwNzM2MDAtMTdkMWM1ZWEzNmRiMzYiLCJ0eXBlIjoidHJhY2siLCJldmVudCI6InNlYXJjaF9yZXN1bHRfZXhwdXJlIiwiX3RyYWNrX2lkIjo3MjUyNDM3Mjd9"
11 |
12 |
13 | def _response_handler(resp):
14 | """
15 | API接口响应参数处理器
16 | :return:
17 | """
18 | pass
19 |
20 |
21 | def _error_handler(exc_type, exc_val, exc_tb):
22 | """
23 | API错误响应处理器
24 | :return:
25 | """
26 | logging.info('type: ' + exc_type)
27 | logging.info('val: ' + exc_val)
28 | logging.info('tb: ' + exc_tb)
29 |
30 |
@error_handler(_error_handler)
@response_handler(_response_handler)
@headers({
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36",
    "version": "TYC-XCX-WX",
    "Host": "api9.tianyancha.com",
    "Authorization": AUTHORIZATION,
    'x-auth-token': X_AUTH_TOKEN,
})
class TianyanchaBasicInfo(Consumer):
    """
    Basic enterprise data (search, detail, business risk).
    """

    def __init__(self, base_url="", client=None, converters=(), auth=None, hooks=(), **kwargs):
        """Create the consumer; a falsy *base_url* selects
        ``https://api9.tianyancha.com``."""
        super().__init__(
            base_url or "https://api9.tianyancha.com",
            client,
            converters,
            auth,
            hooks,
            **kwargs
        )

    @returns.json
    @get("/services/v3/search/sNorV3/{q}")
    def list_by_page(
        self,
        keyword: Path("q"),
        page_num: Query("pageNum"),
        page_size: Query("pageSize"),
        sort_type: Query("sortType"),
    ):
        """
        Query a paged list of enterprises by keyword.

        :param keyword: search keyword, substituted for ``{q}`` in the path
        :param page_num: sent as the ``pageNum`` query parameter
        :param page_size: sent as the ``pageSize`` query parameter
        :param sort_type: sent as the ``sortType`` query parameter
        :return: the response decoded as JSON
        """

    @returns.json
    @get("/services/v3/t/common/baseinfoV5/{enterpriseId}")
    def get_enterprise_detail(self, enterprise_id: Path("enterpriseId")):
        """
        Query the detail of one enterprise.

        :param enterprise_id: substituted for ``{enterpriseId}`` in the path
        :return: the response decoded as JSON
        """

    @returns.json
    @get("/services/v3/risk/companyRiskInfoV4")
    def get_enterprise_business_risk(self, enterprise_id: Query("id")):
        """
        Query the business-risk information of one enterprise.

        :param enterprise_id: sent as the ``id`` query parameter
        :return: the response decoded as JSON
        """
78 |
79 |
@error_handler(_error_handler)
@response_handler(_response_handler)
@headers({
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36",
    "version": "TYC-XCX-WX",
    "Host": "capi.tianyancha.com",
    "Authorization": AUTHORIZATION,
    'x-auth-token': X_AUTH_TOKEN,
})
class TianyanchaDimensional(Consumer):
    """
    Dimensional enterprise data (shareholders, managers).
    """
    def __init__(self, base_url="", client=None, converters=(), auth=None, hooks=(), **kwargs):
        """Create the consumer; a falsy *base_url* selects
        ``https://capi.tianyancha.com``."""
        if not base_url:
            base_url = "https://capi.tianyancha.com"
        super().__init__(base_url, client, converters, auth, hooks, **kwargs)

    def get_enterprise_shareholder_list(self, enterprise_id, page_num, page_size):
        """
        Query the shareholders of an enterprise.

        :param enterprise_id: sent as ``graphId``
        :param page_num: page number of the ``shareHolder`` section
        :param page_size: page size of the ``shareHolder`` section
        :return: the response decoded as JSON
        """
        body = {
            "graphId": enterprise_id,
            "hkVersion": 1,
            "typeList": {
                "shareHolder": {
                    "pageNum": page_num,
                    "pageSize": page_size,
                    "required": "true"
                }
            }
        }
        # The endpoint method collects its body from keyword arguments
        # (``**request_body: Body``), so the dict has to be unpacked.
        return self.__get_enterprise_shareholder_list(**body)

    def get_enterprise_manager_list(self, enterprise_id, page_num, page_size):
        """
        Query the managers of an enterprise.

        :param enterprise_id: sent as ``graphId``
        :param page_num: page number of the ``companyStaff`` section
        :param page_size: page size of the ``companyStaff`` section
        :return: the response decoded as JSON
        """
        req_body = {
            "graphId": enterprise_id,
            "hkVersion": 1,
            "typeList": {
                "companyStaff": {
                    "pageNum": page_num,
                    "pageSize": page_size,
                    "required": "true"
                }
            }
        }
        return self.__get_enterprise_manager_list(**req_body)

    # ``@json`` serializes the collected body as JSON; the nested ``typeList``
    # dict cannot be represented as a form-encoded body.
    @returns.json
    @json
    @post("/cloud-facade/company/familyBucket")
    def __get_enterprise_shareholder_list(self, **request_body: Body):
        """
        POST the shareholder query.

        :param request_body: keyword arguments forming the JSON body
        :return: the response decoded as JSON
        """

    @returns.json
    @json
    @post("/cloud-facade/company/familyBucket")
    def __get_enterprise_manager_list(self, **request_body: Body):
        """
        POST the manager query.

        :param request_body: keyword arguments forming the JSON body
        :return: the response decoded as JSON
        """
157 |
--------------------------------------------------------------------------------
/util/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # -*-: coding: utf-8 -*-
3 | """
4 | :author: albert
5 | :date: 02/28/2019
6 | :desc:
7 | """
8 | import sys
9 | sys.path.append('..')
10 |
--------------------------------------------------------------------------------
/util/date.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # -*-: coding: utf-8 -*-
3 | """
4 | :author: lubosson
5 | :date: 2019-04-11
6 | :desc:
7 | """
8 | import datetime
9 |
10 |
def datetime2timestamp(pytime: datetime.datetime) -> int:
    """Convert a ``datetime`` to a Unix timestamp in whole milliseconds.

    :param pytime: the datetime to convert (a naive value is interpreted by
        ``datetime.timestamp()`` as local time)
    :return: milliseconds since the epoch, truncated to an int
    """
    ts = pytime.timestamp() * 1000
    return int(ts)
14 |
15 |
def timestamp2datetime(timestamp: int):
    """Convert a millisecond Unix timestamp to a naive local ``datetime``."""
    seconds = timestamp / 1000
    return datetime.datetime.fromtimestamp(seconds)
19 |
20 |
21 |
22 |
23 |
24 |
--------------------------------------------------------------------------------
/util/httpclient.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # -*-: coding: utf-8 -*-
3 | """
4 | :author: albert
5 | :date: 02/28/2019
6 | :desc: http请求工具类
7 | """
8 | import logging
9 |
10 | import requests
11 |
12 | from config import GLOBAL_PROXY, PROXY_POOL_URL
13 |
14 |
class Request:
    """One-shot HTTP request executed in the constructor.

    The response body is stored in ``data`` when the status code is 200;
    otherwise ``data`` stays ``None`` and the response is logged.
    """

    # Seconds to wait for the server; without a timeout a stalled connection
    # would block the crawler indefinitely. Override per call via ``timeout=``.
    DEFAULT_TIMEOUT = 30

    def __init__(self, url, method=None, params=None, proxy=True, **kwargs):
        """
        :param url: target URL
        :param method: ``'post'`` (any letter case) issues a POST; anything
            else issues a GET
        :param params: query-string parameters
        :param proxy: use a pool proxy when ``GLOBAL_PROXY`` is also enabled
        :param kwargs: forwarded to ``requests`` (e.g. ``headers``, ``json``)
        """
        self.proxy = proxy
        self.url = url
        self.params = params
        self.data = None
        self.method = method
        if isinstance(self.method, str) and self.method.lower() == 'post':
            self.post(**kwargs)
        else:
            self.get(**kwargs)

    def get(self, **kwargs):
        """Issue a GET request and store the body in ``data`` on HTTP 200."""
        self._send(requests.get, **kwargs)

    def post(self, **kwargs):
        """Issue a POST request and store the body in ``data`` on HTTP 200."""
        self._send(requests.post, **kwargs)

    def _send(self, sender, **kwargs):
        """Shared implementation of :meth:`get` / :meth:`post`."""
        p = proxy() if GLOBAL_PROXY and self.proxy else None
        kwargs.setdefault('timeout', self.DEFAULT_TIMEOUT)
        # NOTE(review): verify=False disables TLS certificate verification;
        # confirm this is intentional.
        resp = sender(self.url, params=self.params, verify=False, proxies=p, **kwargs)
        if resp is not None and resp.status_code == 200:
            self.data = resp.text
        else:
            logging.warning(resp)
42 |
43 |
def proxy():
    """Fetch one proxy from the proxy pool.

    :return: a ``proxies`` mapping for ``requests``, or ``None`` when the pool
        request fails or the pool has no proxy to offer
    """
    import json
    r = requests.get(f"{PROXY_POOL_URL}/get", timeout=10)
    if not r or r.status_code != 200:
        return None
    p = json.loads(r.text)
    address = p.get("proxy")
    if not address:
        # The response carries no proxy address; returning a mapping here
        # would produce the unusable URL "http://None".
        return None
    scheme = "https" if p.get('https') else "http"
    # NOTE(review): only the "http" key is set, so per the requests proxies
    # convention this proxy applies to http:// targets only -- confirm whether
    # https:// targets should be routed through it as well.
    return {"http": "%s://%s" % (scheme, address)}
53 |
54 |
55 | if __name__ == '__main__':
56 | print(proxy())
57 |
58 |
--------------------------------------------------------------------------------
/util/log.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # -*-: coding: utf-8 -*-
3 | """
4 | :author: lubosson
5 | :date: 2019-04-11
6 | :desc:
7 | """
8 | import logging
9 | import os
10 | from logging.handlers import TimedRotatingFileHandler
11 |
12 |
def set_file(filename):
    """Configure the root logger to log at INFO level to the console and to a
    daily-rotating file (7 backups kept).

    Calling it again does not attach duplicate handlers: the console handler
    is added once, and a file handler is added once per distinct file.

    :param filename: path of the log file
    """
    console_name = 'set_file.console'
    logger = logging.getLogger()
    fmt = '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s'
    formatter = logging.Formatter(fmt=fmt, datefmt='%m/%d/%Y %H:%M:%S')

    # Console output
    if not any(h.get_name() == console_name for h in logger.handlers):
        console = logging.StreamHandler()
        console.set_name(console_name)
        console.setFormatter(formatter)
        console.setLevel(logging.INFO)
        logger.addHandler(console)

    # File output; file handlers store the absolute path in ``baseFilename``
    log_path = os.path.abspath(os.fspath(filename))
    already_attached = any(
        isinstance(h, TimedRotatingFileHandler) and h.baseFilename == log_path
        for h in logger.handlers
    )
    if not already_attached:
        handler = TimedRotatingFileHandler(filename, when='D', interval=1, backupCount=7)
        handler.setFormatter(formatter)
        handler.setLevel(logging.INFO)
        logger.addHandler(handler)

    logger.setLevel(logging.INFO)
29 |
30 |
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/util/wechat_auth.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | # @author lubosson
4 | # @since 2020-08-06
5 | # @description --
6 | import threading
7 |
8 |
def auth_token() -> str:
    """Return the auth token (currently the fixed placeholder ``"token"``)."""
    token = "token"
    return token
11 |
12 |
class WeChatAuthTask(threading.Thread):
    """Thread that runs *func* and makes its return value available via
    :meth:`get`."""

    def __init__(self, func):
        """
        :param func: zero-argument callable executed in the thread
        """
        super(WeChatAuthTask, self).__init__()
        self.func = func
        # Return value of ``func``; None until the thread has run or when
        # ``func`` raised.
        self.result = None
        # Exception raised by ``func``, if any.
        self.error = None

    def get(self):
        """Wait for the thread to finish and return the result of ``func``.

        :return: the value returned by ``func``, or ``None`` when it raised
        :raises RuntimeError: if the thread was never started (from ``join``)
        """
        self.join()
        return self.result

    def run(self):
        """Call ``func`` and record its result; a failure is kept in
        ``error`` and leaves ``result`` as ``None``."""
        try:
            self.result = self.func()
        except Exception as exc:
            self.error = exc
            self.result = None
        return self.result
28 |
29 |
--------------------------------------------------------------------------------