├── .gitignore ├── LICENSE ├── README.md ├── config ├── __init__.py └── settings.py ├── db ├── __init__.py ├── data.sql ├── models.py └── mysql_connector.py ├── qichacha.py ├── qichacha ├── __init__.py ├── client.py ├── crawler.py └── manager.py ├── requirements.txt ├── tianyancha.py ├── tianyancha ├── __init__.py ├── client.py ├── crawler.py └── tyc_rest_api.py └── util ├── __init__.py ├── date.py ├── httpclient.py ├── log.py └── wechat_auth.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .idea/* 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | logs 56 | 57 | *.log.* 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # pyenv 80 | .python-version 81 | .DS_Store 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | venv/ 93 | ENV/ 94 | env.bak/ 95 | venv.bak/ 96 | 97 | # Spyder project settings 98 | .spyderproject 99 | .spyproject 100 | 101 | # Rope project settings 102 | .ropeproject 103 | 104 | # mkdocs documentation 105 | /site 106 | 107 | # mypy 108 | .mypy_cache/ 109 | *.json 110 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 albertx 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 天眼查、企查查公司信息爬虫 2 | === 3 | 4 | 5 | ## 使用说明 6 | 7 | 1. 设置用户状态 8 | 9 | 抓包工具抓包天眼查、企查查小程序,设置请求头用户鉴权信息,在各自目录的__init__.py文件中。可在此处配置随机UA,项目地址:[fake_useragent](https://github.com/hellysmile/fake-useragent) 10 | 11 | 2. 设置数据源 12 | ```pydocstring 13 | MYSQL_CONFIG = { 14 | 'develop': { 15 | 'host': '192.168.1.103', 16 | 'port': 3306, 17 | 'db': 'enterprise', 18 | 'username': 'root', 19 | 'password': 'root@123' 20 | } 21 | } 22 | ``` 23 | 3. 执行```db/data.sql```生成数据结构 24 | 4. 配置IP代理```config/settings```, 开启global proxy前请先自行部署ip代理池,项目地址:[proxy_pool](https://github.com/jhao104/proxy_pool.git) 25 | ```pydocstring 26 | # 全局代理控制, 27 | GLOBAL_PROXY = True 28 | PROXY_POOL_URL = "http://localhost:5010" 29 | ``` 30 | 5. 设置爬取关键字```qichacha```&```tianyancha``` 31 | ```pydocstring 32 | keys = ['Google'] # 设置爬取列表 33 | crawler.load_keys(keys) 34 | crawler.start() 35 | ``` 36 | 37 | 38 | ## Schedule List 39 | |功能|日期|状态|备注| 40 | |---|---|---|---| 41 | |鉴权Token提取||待完成|| 42 | |内置IP代理||待完成|| 43 | |防封策略||待完成|| 44 | |容器化运行||待完成|| 45 | 46 |
47 |
48 | 49 | Please Kindly Note That 50 | === 51 | 52 | 程序员技术交流tg群,欢迎大家加入!!! 53 | 54 | 内有技术交流!工作内推!远程工作!兼职、私活儿!!。 55 | 56 | Telegram群链接:[程序员社区https://t.me/+iZK2y8zMUiE0NDE1](https://t.me/+iZK2y8zMUiE0NDE1) 57 | 58 | 群二维码: 59 | 60 | 61 | -------------------------------------------------------------------------------- /config/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*-: coding: utf-8 -*- 3 | """ 4 | :author: lubosin 5 | :date: 03/28/2019 6 | """ 7 | import logging as log 8 | import sys 9 | 10 | import config 11 | from config.settings import * 12 | 13 | sys.path.append("..") 14 | 15 | 16 | class MysqlEnviron: 17 | CONFIG = MysqlConfig.get(ENV) 18 | if not CONFIG: 19 | log.error('no active environment') 20 | exit(0) 21 | 22 | @property 23 | def host(self): 24 | return self.CONFIG.get('host') 25 | 26 | @property 27 | def port(self): 28 | return self.CONFIG.get('port') 29 | 30 | @property 31 | def database(self): 32 | return self.CONFIG.get('db') 33 | 34 | @property 35 | def username(self): 36 | return self.CONFIG.get('username') 37 | 38 | @property 39 | def password(self): 40 | return self.CONFIG.get('password') 41 | 42 | 43 | -------------------------------------------------------------------------------- /config/settings.py: -------------------------------------------------------------------------------- 1 | # "dev", "test", "prod" 2 | ENV = "dev" 3 | 4 | # 全局代理控制 5 | GLOBAL_PROXY = True 6 | PROXY_POOL_URL = "http://127.0.0.1:5010" 7 | 8 | """ mysql 配置 """ 9 | MysqlConfig = { 10 | 'dev': { 11 | 'host': '192.168.1.103', 12 | 'port': 3306, 13 | 'db': 'enterprise', 14 | 'password': 'root@123' 15 | }, 16 | 'test': { 17 | 18 | 'username': 'root', 19 | }, 20 | 'prod': { 21 | 22 | } 23 | } 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /db/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*-: coding: utf-8 -*- 3 | """ 4 | :author: albert 5 | :date: 02/28/2019 6 | :desc: 7 | """ 8 | import sys 9 | sys.path.append('..') 10 | -------------------------------------------------------------------------------- /db/data.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS `company`; 2 | CREATE TABLE `company` ( 3 | `id` int(11) unsigned not NULL AUTO_INCREMENT primary key COMMENT 'PK', 4 | `name` varchar(128) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '公司名', 5 | `representative` varchar(40) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '法人代表', 6 | `address` varchar(200) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '公司地址', 7 | `region` varchar(15) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '所属地区(省)', 8 | `city` varchar(15) character set utf8mb4 null default '-' COMMENT '城市', 9 | `district` varchar(15) character set utf8mb4 null default '-' COMMENT '区/县', 10 | `geoloc` varchar(80) character set utf8mb4 null default '-' 11 | comment '经纬度,json -> {"lat": "30.18484477830133", "long": "120.06383340659741"}', 12 | `biz_status` varchar(20) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '经营状态', 13 | `credit_code` varchar(32) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '统一社会信用代码', 14 | `register_code` varchar(32) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '注册号', 15 | `phone` varchar(20) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '电话', 16 | `email` varchar(50) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '邮箱', 17 | `setup_time` varchar(20) NULL DEFAULT '-' COMMENT '成立时间', 18 | `industry` varchar(64) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '所属行业', 19 | `biz_scope` varchar(1200) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '经营范围', 20 | `company_type` varchar(32) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '公司类型', 21 | `registered_capital` varchar(32) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '注册资本', 22 | `actual_capital` varchar(32) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '实缴资本', 23 | `taxpayer_code` varchar(32) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '纳税人识别号', 24 | `organization_code` varchar(32) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '组织机构代码', 25 | `english_name` varchar(128) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '公司英文名', 26 | `authorization` varchar(64) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '登记机关', 27 | `homepage` varchar(64) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '公司官网', 28 | `used_name` varchar(500) CHARACTER SET utf8mb4 NULL DEFAULT '-' COMMENT '公司曾用名', 29 | `search_key` varchar(64) character set utf8mb4 null default '-' comment '搜索关键字', 30 | `create_at` timestamp not NULL DEFAULT CURRENT_TIMESTAMP COMMENT '插入时间', 31 | `modify_at` timestamp DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP COMMENT '最后操作时间', 32 | # index un_key() comment '联合索引', 33 | unique key uq_credit_reg_code(`credit_code`, `register_code`) 34 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '企业信息表'; 35 | 36 | # 股东信息 37 | drop table if exists `dim_shareholder`; 38 | create table `dim_shareholder`( 39 | `id` integer not null primary key auto_increment comment 'pk', 40 | `credit_code` varchar(255) default null comment '企业社会信用代码', 41 | `name` varchar(255) default null comment '股东名称', 42 | `alias` varchar(255) default null comment '别称', 43 | `avatar` varchar(255) default null comment '股东头像', 44 | `control_ratio` varchar(255) default null comment '股东控股比例', 45 | `tags` json default null comment '股东信息', 46 | constraint unique index unq_index(`credit_code`, `name`) 47 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '企业股东信息表'; 48 | 49 | drop table if exists `dim_company_manager`; 50 | create table `dim_company_manager`( 51 | `id` integer not null primary key auto_increment comment 'pk', 52 | `credit_code` varchar(255) default null comment '企业社会信用代码', 53 | `name` varchar(255) default null comment '企业高管名称', 54 | `titles` json default null comment '高管title', 55 | `manager_type` varchar(255) default null comment '高管类型', 56 | constraint unique index unq_index(`credit_code`, `name`) 57 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '企业高管信息表'; 58 | 59 | drop table if exists `province`; 60 | create table `province`( 61 | `id` integer unsigned not null primary key auto_increment comment 'pk', 62 | `simple` char(3) null default 'CN' comment '省份拼音简写', 63 | `code` varchar(6) null default '000000' comment '全国代码', 64 | `name` varchar(10) null default '全国' comment '省份中文', 65 | index idx_code(`code`) 66 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '地区省份表'; 67 | 68 | drop table if exists `city`; 69 | create table `city`( 70 | `id` integer unsigned not null primary key auto_increment comment 'pk', 71 | `parent` varchar(6) null comment '父级省', 72 | `code` varchar(6) null comment '市、区级代码', 73 | `name` varchar(10) null comment '市、区级名', 74 | index un_key(`parent`, `code`) 75 | ) ENGINE = InnoDB default CHARSET = utf8mb4 COMMENT '市区级表'; 76 | 77 | # drop table if exists `keyword`; 78 | # create table `keyword` ( 79 | # `id` int(11) unsigned not null auto_increment primary key comment 'pk', 80 | # `name` varchar(40) character set utf8mb4 null comment '关键字', 81 | # `status` tinyint(1) unsigned null default 0 comment '状态, 0: 未爬取,1: 爬取中,2: 已爬取,3: 爬取失败, 4: 丢弃', 82 | # `insert_at` timestamp not null default current_timestamp() comment '添加时间' 83 | # ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '关键字表'; 84 | 85 | -------------------------------------------------------------------------------- /db/models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # @author bouxin 4 | # @since 2019-09-27 5 | # @description -- 6 | 7 | 8 | class Company(object): 9 | def __init__(self): 10 | self.id = None 11 | self.name = None 12 | self.short_name = None 13 | self.representative = None 14 | self.found_time = None 15 | self.company_address = None 16 | self.register_address = None 17 | self.province = None 18 | self.city = None 19 | self.district = None 20 | self.biz_status = None 21 | # lat-long:: eg. {'latitude': '12.0023', 'longitude': '120.180'} 22 | self.geoloc = None 23 | self.emails = None 24 | self.phones = None 25 | self.contact = None 26 | self.biz_scope = None 27 | self.company_type = None 28 | self.score = 50.00 29 | self.register_capital = None 30 | self.websites = None 31 | self.credit_code = None 32 | self.taxpayer_code = None 33 | self.register_code = None 34 | self.organization_code = None 35 | self.tags = None 36 | self.industry = None 37 | self.keyword = None 38 | self.logo = None 39 | self.company_desc = None 40 | self.financing_round = None 41 | self.competitions = None 42 | self.english_name = None 43 | self.register_institute = None 44 | self.actual_capital = None 45 | self.used_name = None 46 | self.staffs = 1 47 | self.tax_address = None 48 | self.taxpayer_bank = None 49 | self.portraits = None 50 | self.shareholders = [] 51 | self.managers = [] 52 | 53 | def __str__(self) -> str: 54 | return ', '.join('%s: %s' % elem for elem in self.__dict__.items()) 55 | 56 | 57 | class CompanyShareholder(object): 58 | def __init__(self): 59 | self.name = None 60 | self.alias = None 61 | self.avatar = None 62 | self.control_ratio = None 63 | self.tags = [] 64 | 65 | def __str__(self) -> str: 66 | return ', '.join('%s: %s' % elem for elem in self.__dict__.items()) 67 | 68 | 69 | class CompanyManager(object): 70 | def __init__(self): 71 | self.name = None 72 | self.titles = [] 73 | self.manager_type = None 74 | 75 | def __str__(self) -> str: 76 | return ', '.join('%s: %s' % elem for elem in self.__dict__.items()) 77 | 78 | 79 | class Province(object): 80 | def __init__(self): 81 | self.id = None 82 | self.code = 000000 83 | self.name = '全国' 84 | self.simple = 'CN' 85 | 86 | def __str__(self) -> str: 87 | return ', '.join('%s: %s' % elem for elem in self.__dict__.items()) 88 | 89 | 90 | class City(object): 91 | def __init__(self): 92 | self.id = None 93 | self.parent = 000000 94 | self.code = 000000 95 | self.name = '市区' 96 | 97 | def __str__(self) -> str: 98 | return ', '.join('%s: %s' % elem for elem in self.__dict__.items()) 99 | 100 | -------------------------------------------------------------------------------- /db/mysql_connector.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*-: coding: utf-8 -*- 3 | """ 4 | :author: albert 5 | :date: 03/07/2019 6 | """ 7 | 8 | 9 | from dbutils.pooled_db import PooledDB 10 | from config import MysqlEnviron 11 | import logging as log 12 | import pymysql 13 | 14 | 15 | connection_pool = PooledDB(creator=pymysql, 16 | maxconnections=20, 17 | host=MysqlEnviron.host, 18 | port=MysqlEnviron.port, 19 | db=MysqlEnviron.database, 20 | user=MysqlEnviron.username, 21 | passwd=MysqlEnviron.password) 22 | 23 | 24 | def insert_company(data: list): 25 | """ 26 | 插入操作 27 | :param data: 28 | :return: 29 | """ 30 | sql = 'insert into `company`(`name`,`representative`,`address`,`region`,`city`,`district`,' \ 31 | '`geoloc`,`biz_status`,`credit_code`,`register_code`,`phone`,`email`,`setup_time`,' \ 32 | '`industry`, `biz_scope`,`company_type`,`registered_capital`,`actual_capital`,' \ 33 | '`taxpayer_code`, `organization_code`,`english_name`,`authorization`,`homepage`,' \ 34 | '`used_name`,`create_at`, `modify_at`, `search_key`) ' \ 35 | 'values(%(name)s,%(representative)s,%(address)s,%(region)s,%(city)s,%(district)s,' \ 36 | '%(geoloc)s,%(biz_status)s,%(credit_code)s,%(register_code)s,%(phone)s,%(email)s,' \ 37 | '%(setup_time)s, %(industry)s,%(biz_scope)s,%(company_type)s,%(registered_capital)s,' \ 38 | '%(actual_capital)s, %(taxpayer_code)s,%(organization_code)s,%(english_name)s,' \ 39 | '%(authorization)s,%(homepage)s, %(used_name)s,now(),now(), %(keyword)s) ' \ 40 | 'on duplicate key update `name`=%(name)s,`representative`=%(representative)s,' \ 41 | '`address`=%(address)s,`region`=%(region)s,`geoloc`=%(geoloc)s,' \ 42 | '`biz_status`=%(biz_status)s,`credit_code`=%(credit_code)s,' \ 43 | '`register_code`=%(register_code)s,`phone`=%(phone)s,`email`=%(email)s,' \ 44 | '`setup_time`=%(setup_time)s,`industry`=%(industry)s,`biz_scope`=%(biz_scope)s,' \ 45 | '`company_type`=%(company_type)s,`registered_capital`=%(registered_capital)s,' \ 46 | '`actual_capital`=%(actual_capital)s,`taxpayer_code`=%(taxpayer_code)s,' \ 47 | '`organization_code`=%(organization_code)s,`english_name`=%(english_name)s,' \ 48 | '`authorization`=%(authorization)s,`homepage`=%(homepage)s,`used_name`=%(used_name)s,' \ 49 | '`modify_at`=now()' 50 | for company in data: 51 | managers = company.managers 52 | shareholders = company.shareholders 53 | write(sql, company) 54 | insert_company_manager(managers) 55 | insert_company_shareholder(shareholders) 56 | 57 | 58 | def insert_company_shareholder(data: list): 59 | sql = 'insert into `dim_shareholder`(`credit_code`, `name`, `alias`, `avatar`, `control_ratio`, `tags`) ' \ 60 | 'values (%(credit_code)s, %s(name)s, %(alias)s, %(avatar)s, %(control_ratio)s, %(tags)s) ' \ 61 | 'on duplicate key update `name`=%(name)s, `alias`=%(alias)s, `avatar`=%(avatar)s, ' \ 62 | '`control_ratio`=%(control_ratio)s, `tags`=%(tags)s' 63 | for shareholder in data: 64 | return write(sql, shareholder) 65 | 66 | 67 | def insert_company_manager(data: list): 68 | sql = 'insert into `dim_company_manager`(`credit_code`, `name`, `titles`, `manager_type`) ' \ 69 | 'values (%(credit_code)s, %(name)s, %(titles)s, %(manager_type)s)' \ 70 | 'on duplicate key update `name`=%(name)s, `titles`=%(titles)s, `manager_type`=%(manager_type)s' 71 | for manager in data: 72 | return write(sql, manager) 73 | 74 | 75 | def write(sql: str, data: any): 76 | connection = connection_pool.connection() 77 | cursor = connection.cursor() 78 | result = cursor.execute(sql, data) 79 | 80 | try: 81 | connection.commit() 82 | except RuntimeError as error: 83 | connection.rollback() 84 | log.error('Insertion Error!') 85 | raise error 86 | 87 | return result 88 | 89 | -------------------------------------------------------------------------------- /qichacha.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*-: coding: utf-8 -*- 3 | """ 4 | :author: lubosson 5 | :date: 2019-04-16 6 | :desc: 7 | """ 8 | from qichacha import crawler as QccCrawler 9 | from util import log 10 | import urllib3 11 | urllib3.disable_warnings() 12 | 13 | 14 | log.set_file("./logs/qichacha.log") 15 | app = QccCrawler 16 | 17 | if __name__ == '__main__': 18 | keys = ['Google中国'] 19 | app.load_keys(keys) 20 | app.start() 21 | 22 | -------------------------------------------------------------------------------- /qichacha/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*-: coding: utf-8 -*- 3 | """ 4 | :author: lubosson 5 | :date: 2019-04-15 6 | :desc: 7 | """ 8 | import sys 9 | sys.path.append('..') 10 | 11 | """ 关键字搜索API """ 12 | SEARCH_API = "https://xcx.qichacha.com/wxa/v1/base/advancedSearchNew" 13 | """ 企业详情API """ 14 | COMPANY_DETAIL_API = "https://xcx.qichacha.com/wxa/v1/base/getEntDetail" 15 | """ 地区代码列表 """ 16 | AREA_API = "https://xcx.qichacha.com/wxa/v1/admin/getAreaList" 17 | """ web浏览器no-login COOKIE """ 18 | COOKIE = "zg_did=%7B%22did%22%3A%20%22168dbc0b22f6e5-0d361e70cfef92-10306653-13c680-168dbc0b23013bd%22%7D; _uab_collina=154987506595105102560196; acw_tc=78c7474915498750659746725e47bcf5da5e01750eaa818d83d5019d1f; saveFpTip=true; UM_distinctid=168e101305e193-0665042ea0cf1-133b6850-13c680-168e101305f37d; CNZZDATA1254842228=1871928231-1549959491-https%253A%252F%252Fwww.qichacha.com%252F%7C1549959491; QCCSESSID=780j6eils4m98fspmr9cvtc9p5; hasShow=1; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201551756182960%2C%22updated%22%3A%201551756803803%2C%22info%22%3A%201551242110203%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22cuid%22%3A%20%22fc6fca91d248e7cf976bd652db7e11c6%22%7D" 19 | USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36" 20 | """ 伪装请求头,更多参数抓包qcc小程序 """ 21 | REQUEST_HEADERS = { 22 | "User-Agent": USER_AGENT, 23 | "Cookie": COOKIE 24 | } 25 | """ 26 | 授权企查查小程序返回TOKEN 过期时间1h, 自行更新 27 | 可走代理方式模拟应用登陆获取该token 28 | """ 29 | TOKEN = "9a62aaad7cda6c73a35d598f93e8d169" 30 | 31 | -------------------------------------------------------------------------------- /qichacha/client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # @author lubosson 4 | # @since 2019-09-27 5 | # @description -- 6 | import json 7 | from time import sleep 8 | from qichacha import * 9 | from util.httpclient import Request 10 | 11 | 12 | class QichachaClient: 13 | @staticmethod 14 | def search(keyword: str) -> list: 15 | results = [] 16 | if keyword: 17 | payload = { 18 | "searchKey": keyword, 19 | "token": TOKEN, 20 | "pageIndex": 1, # 每个关键字默认获取第一页数据共20条 21 | "searchType": 0, 22 | "isSortAsc": False 23 | } 24 | data = Request(SEARCH_API, params=payload, headers=REQUEST_HEADERS).data 25 | sleep(2) 26 | if data: 27 | data = json.loads(data) 28 | if data.get('status') == 200: 29 | results.append(data.get('result', {}).get('Result', [])) 30 | return results 31 | 32 | @staticmethod 33 | def search_detail(key_no): 34 | detail = dict() 35 | if key_no: 36 | payload = { 37 | "token": TOKEN, 38 | "unique": key_no 39 | } 40 | data = Request(url=COMPANY_DETAIL_API, params=payload, headers=REQUEST_HEADERS).data 41 | sleep(2) 42 | 43 | if data: 44 | data = json.loads(data) 45 | if data.get('status') == 200: 46 | detail = data.json().get('result', {}).get('Company', {}) 47 | return detail 48 | 49 | 50 | -------------------------------------------------------------------------------- /qichacha/crawler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*-: coding: utf-8 -*- 3 | """ 4 | :author: lubosson 5 | :date: 2019-04-15 6 | :desc: 7 | """ 8 | import logging as log 9 | from qichacha.client import QichachaClient 10 | from qichacha.manager import QichachaManager 11 | from db.models import Company 12 | 13 | # 企查查客户端 14 | qcc_client = QichachaClient() 15 | manager = QichachaManager() 16 | 17 | 18 | def start(): 19 | keywords = globals().get('keywords') 20 | for keyword in keywords: 21 | raw_companies = qcc_client.search(keyword) 22 | log.info('正在处理爬取[%s]' % keyword) 23 | # company对象 24 | company = Company() 25 | for raw_company in raw_companies: 26 | company.keyword = keyword 27 | # 组装公司信息 28 | manager.assembly(company, raw_company) 29 | raw_company_detail = qcc_client.search_detail(raw_company.get('KeyNo')) 30 | # 补充公司详细信息 31 | manager.assembly_detail(company, raw_company_detail) 32 | log.info(company) 33 | log.info('completed') 34 | 35 | 36 | def load_keys(keys: list): 37 | globals().setdefault('keywords', keys) 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /qichacha/manager.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # @author lubosson 4 | # @since 2019-09-27 5 | # @description -- 6 | from db.models import Company 7 | 8 | 9 | class QichachaManager(object): 10 | @classmethod 11 | def assembly(cls, company: Company, raw_company: dict): 12 | company.name = raw_company.get('Name', '-') 13 | company.representative = raw_company.get('OperName', '-') 14 | company.address = raw_company.get('Address', '-') 15 | company.region = raw_company.get('AreaCode', '-') # todo 16 | company.city = raw_company.get('AreaCode', '-') # todo 17 | company.district = raw_company.get('AreaCode', '-') # todo 18 | company.biz_status = raw_company.get('Status', '-') 19 | company.credit_code = raw_company.get('CreditCode', '-') 20 | company.email = raw_company.get('Email', '-') 21 | company.phone = raw_company.get('ContactNumber', '-') 22 | company.biz_scope = raw_company.get('Scope', '-') 23 | company.company_type = raw_company.get('EconKind', '-') 24 | company.taxpayer_code = raw_company.get('CreditCode', '-') 25 | company.registered_capital = raw_company.get('RegistCapi', '-') 26 | company.lat_long = str({ 27 | 'lat': raw_company.get('X', '-'), 28 | 'long': raw_company.get('Y', '-') 29 | }) 30 | company.setup_time = raw_company.get('StartDate', '-') 31 | 32 | @classmethod 33 | def assembly_detail(cls, company: Company, raw_company_detail: dict): 34 | company.homepage = raw_company_detail.get('WebSite', '-')[0:30] 35 | company.register_code = raw_company_detail.get('No', '-') 36 | company.organization_code = raw_company_detail.get('OrgNo', '-') 37 | company.english_name = raw_company_detail.get('EnglishName', '-') 38 | company.authorization = raw_company_detail.get('BelongOrg', '-') 39 | company.actual_capital = raw_company_detail.get('RealCapi', '缺省') 40 | company.industry = raw_company_detail.get('Industry', dict()).get('Industry', '-') 41 | company.used_name = raw_company_detail.get('OriginalName', '-') 42 | 43 | 44 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | certifi 2 | chardet 3 | DBUtils 4 | idna 5 | PyMySQL 6 | redis 7 | requests 8 | urllib3 9 | uplink -------------------------------------------------------------------------------- /tianyancha.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*-: coding: utf-8 -*- 3 | """ 4 | :author: lubosin 5 | :date: 03/28/2019 6 | """ 7 | from tianyancha import crawler 8 | from util import log 9 | import urllib3 10 | urllib3.disable_warnings() 11 | 12 | 13 | log.set_file("./logs/tianyancha.log") 14 | 15 | 16 | if __name__ == '__main__': 17 | keys = ['谷歌'] 18 | crawler.load_keys(keys) 19 | crawler.start() 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /tianyancha/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*-: coding: utf-8 -*- 3 | """ 4 | :author: albert 5 | :date: 03/08/2019 6 | """ 7 | import sys 8 | sys.path.append('..') 9 | 10 | TycQueryApi = "https://api9.tianyancha.com/services/v3/search/sNorV3/{q}" 11 | TycPortraitApi = "https://api9.tianyancha.com/services/v3/t/common/baseinfoV5/{eid}" 12 | TycShareholderPostApi = "https://capi.tianyancha.com/cloud-facade/company/familyBucket" 13 | TycEnterpriseManagerPostApi = "https://capi.tianyancha.com/cloud-facade/company/familyBucket" 14 | TycEnterpriseRiskApi = "https://api9.tianyancha.com/services/v3/risk/companyRiskInfoV4?{id}" 15 | 16 | """ 请求验证头 """ 17 | AUTHORIZATION = '0###oo34J0VKzLlpdvf8kgFkMlfU_IPY###1642087379312###22494f3155c2e5a4be76e503837fa439' 18 | """ 请求token """ 19 | X_AUTH_TOKEN = "eyJkaXN0aW5jdF9pZCI6IjE3ZDFjNWVhMzZjNGY2LTA5ZjU2NWUwNWViNTZjLTFjMzA2ODUxLTIwNzM2MDAtMTdkMWM1ZWEzNmRiMzYiLCJsaWIiOnsiJGxpYiI6ImpzIiwiJGxpYl9tZXRob2QiOiJjb2RlIiwiJGxpYl92ZXJzaW9uIjoiMS4xNS4yNCJ9LCJwcm9wZXJ0aWVzIjp7IiR0aW1lem9uZV9vZmZzZXQiOi00ODAsIiRzY3JlZW5faGVpZ2h0IjoxMDgwLCIkc2NyZWVuX3dpZHRoIjoxOTIwLCIkbGliIjoianMiLCIkbGliX3ZlcnNpb24iOiIxLjE1LjI0IiwiJGxhdGVzdF90cmFmZmljX3NvdXJjZV90eXBlIjoi6Ieq54S25pCc57Si5rWB6YePIiwiJGxhdGVzdF9zZWFyY2hfa2V5d29yZCI6IuacquWPluWIsOWAvCIsIiRsYXRlc3RfcmVmZXJyZXIiOiJodHRwczovL3d3dy5nb29nbGUuY29tLyIsImN1cnJlbnRfdXJsIjoiaHR0cHM6Ly93d3cudGlhbnlhbmNoYS5jb20vc2VhcmNoP2tleT0lRTYlOUQlQUQlRTUlQjclOUUlRTYlOTklQUUlRTUlODUlQjQlRTQlQkMlODElRTQlQjglOUElRTclQUUlQTElRTclOTAlODYlRTUlOTAlODglRTQlQkMlOTklRTQlQkMlODElRTQlQjglOUEiLCJyZWZlcnJlciI6Imh0dHBzOi8vd3d3LnRpYW55YW5jaGEuY29tL3NlYXJjaD9rZXk9JUU2JTlEJUFEJUU1JUI3JTlFJUU2JTk5JUFFJUU1JTg1JUI0JUU0JUJDJTgxJUU0JUI4JTlBJUU3JUFFJUExJUU3JTkwJTg2JUU1JTkwJTg4JUU0JUJDJTk5JUU0JUJDJTgxJUU0JUI4JTlBIiwidHljaWQiOiI0MmMxZTY1MDQ0ZjYxMWVjYmIxZDY3ZmJiYzEwN2U3NSIsIm5hbWUiOiLmna3lt57mma7lhbTkvIHkuJrnrqHnkIblkIjkvJnkvIHkuJoiLCJtb2R1bGUiOiLkvJjotKjlrp7lkI3orqTor4EiLCIkaXNfZmlyc3RfZGF5IjpmYWxzZX0sImFub255bW91c19pZCI6IjE3ZDFjNWVhMzZjNGY2LTA5ZjU2NWUwNWViNTZjLTFjMzA2ODUxLTIwNzM2MDAtMTdkMWM1ZWEzNmRiMzYiLCJ0eXBlIjoidHJhY2siLCJldmVudCI6InNlYXJjaF9yZXN1bHRfZXhwdXJlIiwiX3RyYWNrX2lkIjo3MjUyNDM3Mjd9" 20 | """ 天眼查头信息 """ 21 | REQUEST_HEADERS = { 22 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36", 23 | "version": "TYC-XCX-WX", 24 | "Host": "api9.tianyancha.com", 25 | "Authorization": AUTHORIZATION, 26 | 'x-auth-token': X_AUTH_TOKEN, 27 | } 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /tianyancha/client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # @author sanfeng 4 | # @since 2019-09-27 5 | # @description -- 6 | import json 7 | import logging 8 | 9 | from db.models import Company, CompanyShareholder, CompanyManager 10 | from tianyancha import * 11 | from urllib.parse import quote 12 | from util.httpclient import Request 13 | 14 | 15 | class TycClient: 16 | def __init__(self, payload=None): 17 | self.payload = payload 18 | self.keyword = None 19 | self.src = [] 20 | self.brand_and_agencies = [] 21 | self.companies = [] 22 | 23 | def search(self, keyword: str): 24 | """ 25 | 根据关键字搜索相关企业信息 26 | :param keyword: 关键字 27 | :return: 28 | """ 29 | self.keyword = keyword 30 | if not self.payload: 31 | self.payload = { 32 | "pageNum": 1, 33 | "pageSize": 20, 34 | "sortType": 0 35 | } 36 | url = TycQueryApi.format(q=quote(keyword)) 37 | data = Request(url, self.payload, headers=REQUEST_HEADERS).data 38 | if data: 39 | api_data = json.loads(data) 40 | if api_data.get("state") == 'ok': 41 | self.src = api_data.get("data", {}).get("companyList", []) 42 | self.brand_and_agencies = api_data.get("data", {}).get("brandAndAgencyList", []) 43 | self.__post_process__() 44 | else: 45 | logging.info("查询异常:[%s]" % api_data) 46 | return self 47 | 48 | def __post_process__(self): 49 | if not self.src: 50 | return 51 | 52 | company_list = self.src 53 | for company in company_list: 54 | company_entity = Company() 55 | # 公司检索的关键字 56 | company_entity.keyword = self.keyword 57 | # 公司主体基本信息 58 | self.EntityHelper.__basic_info__(company, company_entity) 59 | 60 | def is_equal(b_and_a): 61 | return company.get('id') == b_and_a.get('graphId') 62 | 63 | try: 64 | # 公司主体融资阶段、竟品信息 65 | brand_and_agency = filter(is_equal, self.brand_and_agencies).__next__() 66 | self.EntityHelper.__another_info__(brand_and_agency, company_entity) 67 | except: 68 | logging.warning('竟品信息获取失败!') 69 | pass 70 | """ 公司详情 """ 71 | detail_resp = Request(TycPortraitApi.format(eid=company.get("id")), headers=REQUEST_HEADERS).data 72 | if detail_resp: 73 | company_portrait = json.loads(detail_resp) 74 | # 公司详情补充信息 75 | if company_portrait.get("state") == 'ok': 76 | self.EntityHelper.__additional__(company_portrait.get("data", {}), company_entity) 77 | 78 | shareholder_request_body = { 79 | "graphId": company.get("id"), 80 | "hkVersion": 1, 81 | "typeList": { 82 | "shareHolder": { 83 | "pageNum": 1, 84 | "pageSize": 20, 85 | "required": "true" 86 | } 87 | } 88 | } 89 | """ 股东信息 """ 90 | shareholder_resp = Request(TycShareholderPostApi, method='post', json=shareholder_request_body, headers=REQUEST_HEADERS).data 91 | if shareholder_resp: 92 | company_shareholder = json.loads(shareholder_resp) 93 | # 公司详情补充信息 94 | if company_shareholder.get("state") == 'ok': 95 | self.EntityHelper.__shareholder__(company_shareholder.get("data", {}).get("shareHolder", {}), company_entity) 96 | 97 | manager_request_body = { 98 | "graphId": company.get("id"), 99 | "hkVersion": 1, 100 | "typeList": { 101 | "companyStaff": { 102 | "pageNum": 1, 103 | "pageSize": 20, 104 | "required": "true" 105 | } 106 | } 107 | } 108 | """ 高管信息 """ 109 | manager_resp = Request(TycEnterpriseManagerPostApi, method='post', json=manager_request_body, headers=REQUEST_HEADERS).data 110 | if manager_resp: 111 | company_manager = json.loads(manager_resp) 112 | # 公司详情补充信息 113 | if company_manager.get("state") == 'ok': 114 | self.EntityHelper.__company_manager__(company_manager.get("data", {}).get("companyStaff", {}), company_entity) 115 | self.companies.append(company_entity) 116 | 117 | class EntityHelper: 118 | @staticmethod 119 | def __basic_info__(src: dict, target: Company): 120 | # 公司外部系统ID 121 | target.id = src.get('id', '-') 122 | # 公司名称 123 | target.name = src.get('name', '-').replace('', '').replace('', '') 124 | # 公司简称 125 | target.short_name = src.get('alias', '-') 126 | # 公司法人 127 | target.representative = src.get('legalPersonName', '-') 128 | # 公司成立时间 129 | target.found_time = src.get('estiblishTime', '-')[0:10] 130 | # 公司地址 131 | target.company_address = src.get('regLocation', '-') 132 | # 公司注册地址 133 | target.register_address = src.get('regLocation', '-') 134 | # 公司所在省份,例:浙江,北京,广东 135 | target.province = src.get('base', '-') 136 | # 公司所在市 137 | target.city = src.get('city', '-') 138 | # 公司所在区 139 | target.district = src.get('district', '-') 140 | # 公司经营状态 141 | target.biz_status = src.get('regStatus', '-') 142 | # 公司地址经纬度坐标 143 | target.geoloc = str({ 144 | 'latitude': src.get('latitude', '-'), 145 | 'longitude': src.get('longitude', '-') 146 | }) 147 | # 公司邮箱列表 148 | target.emails = src.get('emails', ['-']).split(';')[0].replace('\t', '') 149 | # 公司联系方式列表 150 | target.phones = src.get('phoneList', []) 151 | # 公司联系方式 152 | target.contact = src.get('phoneNum', '-') 153 | # 公司经营范围 154 | target.biz_scope = src.get('businessScope', '-') 155 | # 公司类型 156 | target.company_type = src.get('companyOrgType', '-').replace('\t', '') 157 | # 公司质量分数 158 | target.score = src.get('orginalScore', 0) 159 | # 公司注册资本 160 | target.register_capital = src.get('regCapital', '-') 161 | # 公司统一社会信用代码 162 | target.credit_code = src.get('creditCode', '-') 163 | # 公司纳税号 164 | target.taxpayer_code = src.get('taxCode') 165 | if not target.taxpayer_code: 166 | target.taxpayer_code = target.credit_code 167 | # 公司注册号 168 | target.register_code = src.get('regNumber', '-') 169 | # 公司组织机构代码 170 | target.organization_code = src.get('orgNumber', '-') 171 | # 公司标签列表 172 | target.tags = src.get('labelListV2', []) 173 | # 公司行业分类 174 | target.industry = src.get('categoryStr', '-') 175 | 176 | @staticmethod 177 | def __another_info__(brand_and_agency: dict, company: Company): 178 | # 公司融资轮次 179 | company.financing_round = brand_and_agency.get("round", "未知") 180 | # 公司竟品信息 181 | company.competitions = brand_and_agency.get("jingpinName", []) 182 | # 公司logo 183 | company.logo = brand_and_agency.get("logo") 184 | # 公司简介 185 | company.company_desc = brand_and_agency.get("intro") 186 | 187 | @staticmethod 188 | def __additional__(src: dict, company: Company): 189 | # 公司英文名 190 | company.english_name = src.get('property3') 191 | if not company.english_name: 192 | company.english_name = src.get('nameEn', '-') 193 | # 公司注册机构 194 | company.register_institute = src.get('regInstitute', '-') 195 | # 公司网站地址集 196 | company.websites = src.get('websiteList', '-') 197 | # 公司实缴资本 198 | company.actual_capital = src.get('actualCapital', '缺省') 199 | # 公司曾用名 200 | company.used_name = src.get('historyNames', '-') 201 | # 公司员工人数 202 | company.staffs = src.get('socialStaffNum', None) 203 | if not company.staffs: 204 | company.staffs = src.get('staffNum', 1) 205 | # 公司纳税地址 206 | company.tax_address = src.get('taxAddress', None) 207 | if not company.tax_address: 208 | company.tax_address = src.get('regLocation', '-') 209 | # 公司纳税银行 210 | company.taxpayer_bank = src.get('taxBankName', '-') 211 | # 公司涉足领域标签 212 | company.portraits = src.get('portray', []) 213 | if not company.logo: 214 | company.logo = src.get('logo') 215 | if not company.company_desc: 216 | company.company_desc = src.get('baseInfo', '-') 217 | 218 | @staticmethod 219 | def __shareholder__(src: dict, company: Company): 220 | holder_list = src.get("holderList", []) 221 | for holder in holder_list: 222 | if holder: 223 | shareholder = CompanyShareholder() 224 | shareholder.name = holder.get("name") 225 | shareholder.alias = holder.get("alias") 226 | shareholder.avatar = holder.get("logo") 227 | shareholder.control_ratio = holder.get("proportion") 228 | shareholder.tags = [tag.get("name") for tag in holder.get("tagList", [])] 229 | company.shareholders.append(shareholder) 230 | 231 | @staticmethod 232 | def __company_manager__(src: dict, company: Company): 233 | manager_list = src.get("result", []) 234 | manager_type = src.get("staffTitle", "-") 235 | for manager in manager_list: 236 | company_manager = CompanyManager() 237 | company_manager.manager_type = manager_type 238 | company_manager.name = manager.get("name", "-") 239 | company_manager.titles = manager.get("typeJoin", []) 240 | company.managers.append(company_manager) 241 | 242 | -------------------------------------------------------------------------------- /tianyancha/crawler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*-: coding: utf-8 -*- 3 | """ 4 | :author: albert 5 | :date: 03/08/2019 6 | """ 7 | import logging 8 | from tianyancha.client import TycClient 9 | from db.mysql_connector import * 10 | 11 | 12 | def start(): 13 | """ 入口函数 """ 14 | def __printall(items): 15 | for elem in items: 16 | logging.info(elem.__str__()) 17 | 18 | keys = globals().get('keywords', []) 19 | for key in keys: 20 | logging.info('正在采集[%s]...' % key) 21 | companies = TycClient().search(key).companies 22 | # 写入db 23 | # insert_company(companies) 24 | __printall(companies) 25 | logging.info("completed") 26 | 27 | 28 | def load_keys(keys: list): 29 | globals().setdefault('keywords', keys) 30 | 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /tianyancha/tyc_rest_api.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import logging 5 | 6 | from uplink import * 7 | 8 | AUTHORIZATION = '0###oo34J0VKzLlpdvf8kgFkMlfU_IPY###1642087379312###22494f3155c2e5a4be76e503837fa439' 9 | """ 请求token """ 10 | X_AUTH_TOKEN = "eyJkaXN0aW5jdF9pZCI6IjE3ZDFjNWVhMzZjNGY2LTA5ZjU2NWUwNWViNTZjLTFjMzA2ODUxLTIwNzM2MDAtMTdkMWM1ZWEzNmRiMzYiLCJsaWIiOnsiJGxpYiI6ImpzIiwiJGxpYl9tZXRob2QiOiJjb2RlIiwiJGxpYl92ZXJzaW9uIjoiMS4xNS4yNCJ9LCJwcm9wZXJ0aWVzIjp7IiR0aW1lem9uZV9vZmZzZXQiOi00ODAsIiRzY3JlZW5faGVpZ2h0IjoxMDgwLCIkc2NyZWVuX3dpZHRoIjoxOTIwLCIkbGliIjoianMiLCIkbGliX3ZlcnNpb24iOiIxLjE1LjI0IiwiJGxhdGVzdF90cmFmZmljX3NvdXJjZV90eXBlIjoi6Ieq54S25pCc57Si5rWB6YePIiwiJGxhdGVzdF9zZWFyY2hfa2V5d29yZCI6IuacquWPluWIsOWAvCIsIiRsYXRlc3RfcmVmZXJyZXIiOiJodHRwczovL3d3dy5nb29nbGUuY29tLyIsImN1cnJlbnRfdXJsIjoiaHR0cHM6Ly93d3cudGlhbnlhbmNoYS5jb20vc2VhcmNoP2tleT0lRTYlOUQlQUQlRTUlQjclOUUlRTYlOTklQUUlRTUlODUlQjQlRTQlQkMlODElRTQlQjglOUElRTclQUUlQTElRTclOTAlODYlRTUlOTAlODglRTQlQkMlOTklRTQlQkMlODElRTQlQjglOUEiLCJyZWZlcnJlciI6Imh0dHBzOi8vd3d3LnRpYW55YW5jaGEuY29tL3NlYXJjaD9rZXk9JUU2JTlEJUFEJUU1JUI3JTlFJUU2JTk5JUFFJUU1JTg1JUI0JUU0JUJDJTgxJUU0JUI4JTlBJUU3JUFFJUExJUU3JTkwJTg2JUU1JTkwJTg4JUU0JUJDJTk5JUU0JUJDJTgxJUU0JUI4JTlBIiwidHljaWQiOiI0MmMxZTY1MDQ0ZjYxMWVjYmIxZDY3ZmJiYzEwN2U3NSIsIm5hbWUiOiLmna3lt57mma7lhbTkvIHkuJrnrqHnkIblkIjkvJnkvIHkuJoiLCJtb2R1bGUiOiLkvJjotKjlrp7lkI3orqTor4EiLCIkaXNfZmlyc3RfZGF5IjpmYWxzZX0sImFub255bW91c19pZCI6IjE3ZDFjNWVhMzZjNGY2LTA5ZjU2NWUwNWViNTZjLTFjMzA2ODUxLTIwNzM2MDAtMTdkMWM1ZWEzNmRiMzYiLCJ0eXBlIjoidHJhY2siLCJldmVudCI6InNlYXJjaF9yZXN1bHRfZXhwdXJlIiwiX3RyYWNrX2lkIjo3MjUyNDM3Mjd9" 11 | 12 | 13 | def _response_handler(resp): 14 | """ 15 | API接口响应参数处理器 16 | :return: 17 | """ 18 | pass 19 | 20 | 21 | def _error_handler(exc_type, exc_val, exc_tb): 22 | """ 23 | API错误响应处理器 24 | :return: 25 | """ 26 | logging.info('type: ' + exc_type) 27 | logging.info('val: ' + exc_val) 28 | logging.info('tb: ' + exc_tb) 29 | 30 | 31 | @error_handler(_error_handler) 32 | @response_handler(_response_handler) 33 | @headers({ 34 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36", 35 | "version": "TYC-XCX-WX", 36 | "Host": "api9.tianyancha.com", 37 | "Authorization": AUTHORIZATION, 38 | 'x-auth-token': X_AUTH_TOKEN, 39 | }) 40 | class TianyanchaBasicInfo(Consumer): 41 | """ 42 | 企业基本数据 43 | """ 44 | def __init__(self, base_url="", client=None, converters=(), auth=None, hooks=(), **kwargs): 45 | if not base_url: 46 | base_url = "https://api9.tianyancha.com" 47 | super().__init__(base_url, client, converters, auth, hooks, **kwargs) 48 | 49 | @returns.json 50 | @get("/services/v3/search/sNorV3/{q}") 51 | def list_by_page(self, keyword: Path("q"), page_num: Query("pageNum"), page_size: Query("pageSize"), sort_type: Query("sortType")): 52 | """ 53 | 根据关键字查询企业信息分页列表 54 | :param keyword: 55 | :param page_num: 56 | :param page_size: 57 | :param sort_type: 58 | :return: 59 | """ 60 | 61 | @returns.json 62 | @get("/services/v3/t/common/baseinfoV5/{enterpriseId}") 63 | def get_enterprise_detail(self, enterprise_id: Path("enterpriseId")): 64 | """ 65 | 查询企业信息详情 66 | :param enterprise_id: 67 | :return: 68 | """ 69 | 70 | @returns.json 71 | @get("/services/v3/risk/companyRiskInfoV4") 72 | def get_enterprise_business_risk(self, enterprise_id: Query("id")): 73 | """ 74 | 查询企业经营风险信息 75 | :param enterprise_id: 76 | :return: 77 | """ 78 | 79 | 80 | @error_handler(_error_handler) 81 | @response_handler(_response_handler) 82 | @headers({ 83 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36", 84 | "version": "TYC-XCX-WX", 85 | "Host": "capi.tianyancha.com", 86 | "Authorization": AUTHORIZATION, 87 | 'x-auth-token': X_AUTH_TOKEN, 88 | }) 89 | class TianyanchaDimensional(Consumer): 90 | """ 91 | 企业维度数据 92 | """ 93 | def __init__(self, base_url="", client=None, converters=(), auth=None, hooks=(), **kwargs): 94 | if not base_url: 95 | base_url = "https://capi.tianyancha.com" 96 | super().__init__(base_url, client, converters, auth, hooks, **kwargs) 97 | 98 | def get_enterprise_shareholder_list(self, enterprise_id, page_num, page_size): 99 | """ 100 | 查询企业股东信息 101 | :param enterprise_id: 102 | :param page_size: 103 | :param page_num: 104 | :return: 105 | """ 106 | body = { 107 | "graphId": enterprise_id, 108 | "hkVersion": 1, 109 | "typeList": { 110 | "shareHolder": { 111 | "pageNum": page_num, 112 | "pageSize": page_size, 113 | "required": "true" 114 | } 115 | } 116 | } 117 | return self.__get_enterprise_shareholder_list(body) 118 | 119 | def get_enterprise_manager_list(self, enterprise_id, page_num, page_size): 120 | """ 121 | 查询企业高管信息 122 | :param enterprise_id: 123 | :param page_num: 124 | :param page_size: 125 | :return: 126 | """ 127 | req_body = { 128 | "graphId": enterprise_id, 129 | "hkVersion": 1, 130 | "typeList": { 131 | "companyStaff": { 132 | "pageNum": page_num, 133 | "pageSize": page_size, 134 | "required": "true" 135 | } 136 | } 137 | } 138 | return self.__get_enterprise_manager_list(req_body) 139 | 140 | @returns.json 141 | @post("/cloud-facade/company/familyBucket") 142 | def __get_enterprise_shareholder_list(self, **request_body: Body): 143 | """ 144 | 查询企业股东信息 145 | :param request_body: 146 | :return: 147 | """ 148 | 149 | @returns.json 150 | @post("/cloud-facade/company/familyBucket") 151 | def __get_enterprise_manager_list(self, **request_body: Body): 152 | """ 153 | 查询企业高管 154 | :param request_body: 155 | :return: 156 | """ 157 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*-: coding: utf-8 -*- 3 | """ 4 | :author: albert 5 | :date: 02/28/2019 6 | :desc: 7 | """ 8 | import sys 9 | sys.path.append('..') 10 | -------------------------------------------------------------------------------- /util/date.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*-: coding: utf-8 -*- 3 | """ 4 | :author: lubosson 5 | :date: 2019-04-11 6 | :desc: 7 | """ 8 | import datetime 9 | 10 | 11 | def datetime2timestamp(pytime: datetime.datetime.now()): 12 | ts = pytime.timestamp() * 1000 13 | return int(ts) 14 | 15 | 16 | def timestamp2datetime(timestamp: int): 17 | date = datetime.datetime.fromtimestamp(timestamp / 1000) 18 | return date 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /util/httpclient.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*-: coding: utf-8 -*- 3 | """ 4 | :author: albert 5 | :date: 02/28/2019 6 | :desc: http请求工具类 7 | """ 8 | import logging 9 | 10 | import requests 11 | 12 | from config import GLOBAL_PROXY, PROXY_POOL_URL 13 | 14 | 15 | class Request: 16 | def __init__(self, url, method=None, params=None, proxy=True, **kwargs): 17 | self.proxy = proxy 18 | self.url = url 19 | self.params = params 20 | self.data = None 21 | self.method = method 22 | if self.method == 'post': 23 | self.post(**kwargs) 24 | else: 25 | self.get(**kwargs) 26 | 27 | def get(self, **kwargs): 28 | p = proxy() if GLOBAL_PROXY and self.proxy else None 29 | resp = requests.get(self.url, params=self.params, verify=False, proxies=p, **kwargs) 30 | if resp and resp.status_code == 200: 31 | self.data = resp.text 32 | else: 33 | logging.warning(resp) 34 | 35 | def post(self, **kwargs): 36 | p = proxy() if GLOBAL_PROXY and self.proxy else None 37 | resp = requests.post(self.url, verify=False, proxies=p, **kwargs) 38 | if resp and resp.status_code == 200: 39 | self.data = resp.text 40 | else: 41 | logging.warning(resp) 42 | 43 | 44 | def proxy(): 45 | import json 46 | r = requests.get(f"{PROXY_POOL_URL}/get") 47 | if r and r.status_code == 200: 48 | p = json.loads(r.text) 49 | if p['https']: 50 | return {"http": "https://%s" % p.get("proxy")} 51 | else: 52 | return {"http": "http://%s" % p.get("proxy")} 53 | 54 | 55 | if __name__ == '__main__': 56 | print(proxy()) 57 | 58 | -------------------------------------------------------------------------------- /util/log.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*-: coding: utf-8 -*- 3 | """ 4 | :author: lubosson 5 | :date: 2019-04-11 6 | :desc: 7 | """ 8 | import logging 9 | import os 10 | from logging.handlers import TimedRotatingFileHandler 11 | 12 | 13 | def set_file(filename): 14 | logger = logging.getLogger() 15 | os.getcwd() 16 | handler = TimedRotatingFileHandler(filename, 'D', 1, 7) 17 | fmt = '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s' 18 | formatter = logging.Formatter(fmt=fmt, datefmt='%m/%d/%Y %H:%M:%S') 19 | 20 | handler.setFormatter(formatter) 21 | handler.setLevel(logging.INFO) 22 | # 屏幕输出 23 | console = logging.StreamHandler() 24 | console.setFormatter(formatter) 25 | console.setLevel(logging.INFO) 26 | logger.addHandler(console) 27 | logger.addHandler(handler) 28 | logger.setLevel(logging.INFO) 29 | 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /util/wechat_auth.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # @author lubosson 4 | # @since 2020-08-06 5 | # @description -- 6 | import threading 7 | 8 | 9 | def auth_token() -> str: 10 | return "token" 11 | 12 | 13 | class WeChatAuthTask(threading.Thread): 14 | def __init__(self, func): 15 | super(WeChatAuthTask, self).__init__() 16 | self.func = func 17 | self.result = self.func 18 | 19 | def get(self): 20 | threading.Thread.join(self) 21 | try: 22 | return self.result 23 | except Exception: 24 | return None 25 | 26 | def run(self) -> str: 27 | return "token" 28 | 29 | --------------------------------------------------------------------------------