├── Dockerfile ├── README.md ├── conf.py ├── requirements.txt ├── 工商信息查询.py └── 看准-企业工商信息查询.py /Dockerfile: -------------------------------------------------------------------------------- 1 | # 基于镜像基础 2 | FROM python:3.10.1 3 | 4 | # 设置时区 5 | ENV TZ Asia/Shanghai 6 | 7 | # 设置代码文件夹工作目录 /app 8 | WORKDIR /app 9 | 10 | # 复制当前代码文件到容器中 /app 11 | ADD . /app 12 | 13 | # 安装所需的包 14 | RUN pip install -r requirements.txt -i https://pypi.doubanio.com/simple/ 15 | 16 | CMD ["gunicorn", "-c", "conf.py", "看准-企业工商信息查询:app"] 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 企业工商信息查询接口 2 | 3 | 企业工商信息接口(包含天眼查、企查查、爱企查、国家企业公示系统平台、看准) 4 | 5 | 接口文档(http://127.0.0.1:8081/docs) 6 | 7 | tip:代理设置(158行更换) 8 | 9 | 项目运行 10 | 11 | `pip install -r requirements.txt -i https://pypi.doubanio.com/simple/` 12 | 13 | `uvicorn 工商信息查询:app --host 0.0.0.0 --port 8081 --reload` 14 | 15 | docker 运行 16 | 17 | `docker build -t businessinfo https://github.com/Litre-WU/businessInfo-api.git ` 18 | 19 | `docker run --name businessInfo -d -p 8081:8081 businessinfo` 20 | -------------------------------------------------------------------------------- /conf.py: -------------------------------------------------------------------------------- 1 | # vim: set fileencoding:utf-8 2 | # -*- coding: utf-8 -*- 3 | # Author: Litre WU 4 | # E-mail: litre-wu@tutanota.com 5 | # Software: PyCharm 6 | # File: gunicorn.py 7 | # Time: 4月 23, 2021 8 | import logging 9 | import logging.handlers 10 | from logging.handlers import WatchedFileHandler 11 | import os 12 | import multiprocessing 13 | 14 | # chdir = '/app' # 加载应用程序之前将chdir目录指定到指定目录 15 | 16 | proc_name = 'businessInfo' # 进程名 17 | 18 | bind = '0.0.0.0:8081' # 绑定ip和端口号 19 | 20 | backlog = 512 # 监听队列 21 | 22 | timeout = 10 # 超时 23 | 24 | # worker_class = 'gevent' # 默认的是sync模式 25 | # worker_class = 'uvicorn.workers.UvicornWorker' # 使用uvicorn模式 26 | 
worker_class = 'uvicorn.workers.UvicornH11Worker' # 使用纯python模式 27 | 28 | # workers = multiprocessing.cpu_count() * 2 + 1 # 进程数 29 | # workers = multiprocessing.cpu_count() * 2 + 1 # 进程数 30 | workers = 4 # 进程数 31 | 32 | threads = 4 # 指定每个进程开启的线程数 33 | 34 | # deamon = True # 守护进程 35 | 36 | reload = True # 自动加载 37 | 38 | worker_connections = 2000 # 设置最大并发量 39 | 40 | loglevel = 'info' # 日志级别,这个日志级别指的是错误日志的级别,而访问日志的级别无法设置 41 | 42 | # accesslog = "/businessInfo/logs/demo_access.log" # 访问日志文件, "-" 表示标准输出 43 | 44 | # errorlog = "/businessInfo/logs/demo_err.log" # 错误日志文件, "-" 表示标准输出 45 | 46 | # access_log_format = '%(h)s %(l)s %(u)s %(t)s' 47 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp 2 | fastapi 3 | user-agent 4 | lxml 5 | uvicorn 6 | gunicorn 7 | pandas 8 | python-multipart 9 | brotlipy 10 | loguru 11 | boltons 12 | beautifulsoup4 13 | -------------------------------------------------------------------------------- /工商信息查询.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Author: Litre WU 3 | # E-mail: litre-wu@tutanota.com 4 | # Software: PyCharm 5 | # File: 工商信息查询.py 6 | # Time: 4月 21, 2021 7 | import asyncio 8 | from typing import Optional, List 9 | from fastapi import FastAPI, Header, Cookie, Depends, BackgroundTasks 10 | from starlette.requests import Request 11 | from pydantic import BaseModel, Field 12 | from fastapi.responses import JSONResponse 13 | import aiohttp 14 | from user_agent import generate_user_agent 15 | from lxml import etree 16 | import pandas as pd 17 | import json 18 | import time 19 | from random import randint, sample 20 | import os 21 | from json import load, dump 22 | import socket 23 | from sys import platform 24 | from functools import lru_cache 25 | from loguru import logger 26 | from boltons.cacheutils import LRI, LRU 27 
from hashlib import md5

# Small in-process caches (boltons). Only `lru_cache` is read by the handlers below.
# NOTE(review): this rebinds the name `lru_cache` that the import block takes from functools.
lri_cache = LRI(max_size=100)
lru_cache = LRU(max_size=100)

# Log to "<script name>.log": rotate at 200 MB, zip rotated files, keep them 7 days.
logger.add(f'{os.path.basename(__file__)[:-3]}.log', rotation='200 MB', compression='zip', enqueue=True,
           serialize=False, encoding='utf-8', retention='7 days')

# Address of this machine; only used for the docs link in the OpenAPI tag metadata.
host = socket.gethostbyname(socket.gethostname())

if platform == "win32":
    asyncio.set_event_loop(asyncio.ProactorEventLoop())
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

tags_metadata = [
    {
        "name": "企业工商信息查询接口",
        "description": "企业工商信息查询(天眼查、企查查、爱企查、国家企业公示系统)",
        "externalDocs": {
            "description": "More",
            "url": f"http://{host}/docs",
        },
    },
]

contact = {
    "name": "Litre",
    "url": "http://121.37.209.113",
    "email": "litre-wu@tutanota.com",
}

app = FastAPI(openapi_url="/api/v1/api.json", title="企业工商信息查询接口", contact=contact, openapi_tags=tags_metadata)


async def log(request, **kwargs):
    """Write one access-log line for ``request``.

    The line holds the client address, method, path, scope type / HTTP version
    and the keyword arguments (the request payload). An empty payload is
    logged as an empty string.
    """
    scope = dict(request.items())
    payload = kwargs or ""
    logger.info(
        f'{scope["client"][0]} {scope["method"]} {scope["path"]} {scope["type"]}/{scope["http_version"]} {payload}')


@app.get("/", tags=["首页"])
async def index(request: Request, user_agent: Optional[str] = Header(None), x_token: List[str] = Header(None), ):
    """Landing endpoint: echo the caller's address and headers back as JSON."""
    result = {
        "code": 200,
        "msg": "来了!老弟",
        "result": "你看这个面它又长又宽,就像这个碗它又大又圆",
        "info": {
            # Report the schema URL the app actually serves instead of a
            # hard-coded path that had drifted from the FastAPI() setting.
            "openapi_url": app.openapi_url,
            "ip": request.client.host,
            "x-token": x_token,
            "user-agent": user_agent,
            "headers": dict(request.headers)
        }
    }
    return JSONResponse(result)


class Qcc(BaseModel):
    """Request body of the lookup endpoint."""
    # Company name / search keyword.
    key: str = Field(..., example='哔哩哔哩')
    # Unified social credit code; required by the schema but not used by the scrapers yet.
    creditCode: str = Field(..., example='统一社会信用代码(暂不使用)')


@app.post("/", tags=["企业工商信息查询接口"])
async def api(data: Qcc, request: Request, background_tasks: BackgroundTasks, x_token: List[str] = Header(None),
              user_agent: Optional[str] = Header(None)):
    """Look a company up, serving repeated identical requests from the LRU cache.

    The cache key is the MD5 of the stringified request body. Only answers
    that actually carry data are cached, so a transient failure is not
    replayed to later callers.
    """
    kwargs = data.dict()
    await log(request, **kwargs)
    key = md5(str(kwargs).encode()).hexdigest()
    cached = lru_cache.get(key)
    if cached:
        return cached
    result = await query(**kwargs)
    # `query` returns a truthy envelope even on failure ("msg": "Fail",
    # "result": None), so test the inner payload before caching.
    if result and result.get("result"):
        lru_cache[key] = result
    return JSONResponse(result)
def _random_ip():
    """Return a random dotted-quad string for the spoofed client-address headers."""
    return ".".join(str(randint(1, 254)) for _ in range(4))


async def pub_req(**kwargs):
    """Shared HTTP helper: perform one request with up to two attempts.

    Recognised keyword arguments: ``url`` (required), ``method`` (default
    ``"GET"``), ``params``, ``data``, ``headers`` (merged over the defaults),
    ``proxy``, ``proxy_user``, ``proxy_pass``, ``timeout`` (seconds, default
    5), ``retry`` (attempts already used, default 0) and ``ip`` (value for the
    client-address headers; a random address is generated when omitted).

    Returns the raw response body as ``bytes`` on HTTP 200. Returns ``None``
    when no URL is given or when both attempts fail.
    """
    if not kwargs.get("url", ""):
        return None

    # The original body referenced an undefined name `ip`, so every call died
    # with NameError before the request was sent. Use a caller-supplied or
    # random address instead.
    ip = kwargs.get("ip") or _random_ip()
    spoofed = (
        "X-Forwarded-For", "X-Forwarded", "Forwarded-For", "Forwarded", "X-Forwarded-Proto", "X-Forwarded-Host",
        "X-Requested-With", "X-Client-IP", "X-remote-IP", "X-remote-addr", "X-Real-IP", "True-Client-IP",
        "Client-IP", "X_FORWARDED_FOR", "X_REAL_IP",
    )
    headers = {**dict.fromkeys(spoofed, ip), "User-Agent": generate_user_agent(), **kwargs.get("headers", {})}

    # Treat an empty proxy as "no proxy", and only send credentials when a
    # user name was actually supplied.
    proxy = kwargs.get("proxy") or None
    proxy_user = kwargs.get("proxy_user", "")
    proxy_auth = aiohttp.BasicAuth(proxy_user, kwargs.get("proxy_pass", "")) if proxy and proxy_user else None

    retry = kwargs.get("retry", 0)
    while True:
        try:
            async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=10),
                                             connector=aiohttp.TCPConnector(ssl=False), trust_env=True) as client:
                async with client.request(method=kwargs.get("method", "GET"), url=kwargs["url"],
                                          params=kwargs.get("params", {}),
                                          data=kwargs.get("data", {}), headers=headers, proxy=proxy,
                                          proxy_auth=proxy_auth,
                                          timeout=aiohttp.ClientTimeout(total=kwargs.get("timeout", 5))) as rs:
                    if rs.status == 200:
                        return await rs.read()
                    # `rs.text` is a coroutine method; log the reason phrase instead.
                    logger.info(f"pub_req {kwargs} {rs.status} {rs.reason}")
        except Exception as e:
            logger.info(f"pub_req {kwargs} {e}")
        retry += 1
        if retry >= 2:
            return None
        # Back off without blocking the event loop (time.sleep stalled every
        # other in-flight request).
        await asyncio.sleep(randint(1, 2))
async def get_proxy(**kwargs):
    """Return a list of proxy records, from the local cache file when still valid.

    Unless ``turn`` is truthy, ``proxy.json`` in the working directory is
    consulted first (it is created empty when missing). The cached list is
    returned if the first record's ``expire_time`` ("%Y-%m-%d %H:%M:%S",
    local time) lies in the future. Otherwise a fresh record is requested
    from the provider, written to ``proxy.json`` and returned.

    Returns ``None`` when the provider cannot be reached or yields no data
    after two attempts. ``retry`` may be passed to pre-consume attempts.
    """
    if not kwargs.get("turn", 0):
        if not os.path.exists('proxy.json'):
            with open('proxy.json', 'w') as f:
                json.dump([], f)
        with open('proxy.json', 'r') as f:
            cached = json.load(f)
        if cached:
            expire_time = int(time.mktime(time.strptime(cached[0]["expire_time"], "%Y-%m-%d %H:%M:%S")))
            if int(time.time()) < expire_time:
                return cached

    # Zhima proxy provider.
    url = 'http://webapi.http.zhimacangku.com/getip'
    params = {"num": "1", "type": "2", "pro": "0", "city": "0", "yys": "0", "port": "1", "time": "1", "ts": "1",
              "ys": "0", "cs": "0", "lb": "1", "sb": "0", "pb": "4", "mr": "1", "regions": ""}

    retry = kwargs.get("retry", 0)
    while True:
        try:
            raw = await pub_req(url=url, params=params)
            # Check for a failed request before touching the body; the
            # original called .decode() on None.
            if not raw:
                return None
            logger.info(raw.decode())
            payload = json.loads(raw)
            if payload.get("data", ""):
                with open('proxy.json', 'w') as f:
                    json.dump(payload["data"], f)
                return payload["data"]
        except Exception as e:
            logger.info(e)
        retry += 1
        if retry >= 2:
            return None
        await asyncio.sleep(randint(0, 1))


async def query_ip(**kwargs):
    """Return the public IP address seen by httpbin, optionally through a proxy.

    Accepts ``proxy``, ``proxy_user``, ``proxy_pass`` and ``retry``. Returns
    the first address of the ``origin`` field, or ``None`` on failure.
    """
    url = 'http://httpbin.org/get?show_env=1'
    retry = kwargs.get("retry", 0)
    while True:
        try:
            raw = await pub_req(url=url, proxy=kwargs.get("proxy", ""), proxy_user=kwargs.get("proxy_user", ""),
                                proxy_pass=kwargs.get("proxy_pass", ""))
            if not raw:
                return None
            # `origin` may be a comma-separated chain; a whitespace split left
            # a trailing comma on the first address.
            return json.loads(raw)["origin"].split(",")[0].strip()
        except Exception as e:
            logger.info(f'query_ip {e}')
        retry += 1
        if retry >= 2:
            return None
        await asyncio.sleep(randint(1, 2))
async def query(**kwargs):
    """Look a company up across all sources, escalating to proxies on failure.

    Each round asks qcc, tyc, aqc and gsxt in that order and stops at the
    first source that returns data. Round one uses the caller's settings,
    round two the cached-or-fresh proxy from ``get_proxy()``, round three a
    forced fresh proxy (``turn=1``). After three empty rounds a "Fail"
    envelope is returned.

    Returns ``{"code": 200, "msg": "OK", "result": [...]}`` on success or
    ``{"code": 200, "msg": "Fail", "result": None}``.
    """
    # Keep the round counter local. The original stored it in kwargs["retry"],
    # which the scrapers read as *their* retry counter, so from round two on
    # they gave up after a single attempt.
    attempt = kwargs.pop("retry", 0)
    while True:
        for source in (qcc, tyc, aqc, gsxt):
            found = await source(**kwargs)
            if found:
                return {"code": 200, "msg": "OK", "result": found}
        attempt += 1
        if attempt > 2:
            return {"code": 200, "msg": "Fail", "result": None}
        proxy = await get_proxy() if attempt == 1 else await get_proxy(turn=1)
        kwargs["proxy"] = f'http://{proxy[0]["ip"]}:{proxy[0]["port"]}' if proxy else ""


async def tyc(**kwargs):
    """Search Tianyancha (mobile site) for ``key`` and fetch every hit's detail page.

    Accepts ``key``, ``proxy``, ``proxy_user``, ``proxy_pass`` and ``retry``.
    Returns the list of non-empty detail records, or ``None`` when the search
    fails, has no hits, or raises on both attempts.
    """
    # The original sent proxy_user as proxy_pass and dropped the credentials
    # for the detail requests; build the options once and reuse them.
    proxy_opts = {name: kwargs.get(name, "") for name in ("proxy", "proxy_user", "proxy_pass")}
    retry = kwargs.get("retry", 0)
    while True:
        try:
            raw = await pub_req(url="https://m.tianyancha.com/search", params={"key": kwargs.get("key", "")},
                                headers={"Referer": "https://m.tianyancha.com"}, **proxy_opts)
            if not raw:
                return None
            ids = etree.HTML(raw.decode()).xpath('//div[@class="search-company-item"]/@onclick')
            if not ids:
                return None
            # str.strip takes a *set of characters*; this peels the
            # "jumpToCompany('...');" wrapper off the id.
            ids = [x.strip("jumpToCompany('").strip("');") for x in ids]
            details = await asyncio.gather(*(tyc_detail(id=company_id, **proxy_opts) for company_id in ids))
            return [x for x in details if x]
        except Exception as e:
            logger.info(f'tyc {e}')
        retry += 1
        if retry >= 2:
            return None
async def tyc_detail(**kwargs):
    """Fetch one Tianyancha company page and map it to the common record layout.

    Accepts ``id`` (company id), ``proxy``, ``proxy_user``, ``proxy_pass``
    and ``retry``. Returns the record dict, or ``None`` when ``id`` is
    missing, the request fails, or the page yields no data rows on both
    attempts.
    """
    _id = kwargs.get("id", "")
    if not _id:
        return None

    # (output field, page label) in output order.
    fields = (
        ("social_credit_code", "统一社会信用代码"), ("name_cn", None), ("legal_person", "法定代表人"),
        ("status", "经营状态"), ("found_date", "成立日期"), ("registered_capital", "注册资本"),
        ("really_capital", "实缴资本"), ("issue_date", "核准日期"), ("organization_code", "组织机构代码"),
        ("regist_code", "工商注册号"), ("taxpayer_code", "纳税人识别号"), ("type", "企业类型"),
        ("license_start_date", "营业期限"), ("taxpayer_crop", "纳税人资质"), ("industry_involved", "行业"),
        ("province", "所属地区"), ("regist_office", "登记机关"), ("staff_size", "人员规模"),
        ("insured_size", "参保人数"), ("transformer_name", "曾用名"), ("name_en", "英文名称"),
        ("imp_exp_enterprise_code", "进出口企业代码"), ("address", "注册地址"), ("regist_address", "注册地址"),
        ("business_scope", "经营范围"),
    )

    retry = kwargs.get("retry", 0)
    while True:
        try:
            raw = await pub_req(url=f'https://m.tianyancha.com/company/{_id}',
                                headers={"Referer": "https://m.tianyancha.com/search"},
                                proxy=kwargs.get("proxy", ""), proxy_user=kwargs.get("proxy_user", ""),
                                proxy_pass=kwargs.get("proxy_pass", ""))
            if not raw:
                return None
            tree = etree.HTML(raw.decode())  # parse once, reuse for the title lookup
            divs = tree.xpath('//div[@class="content"]/div[@class="divide-content"]/div')
            rows = [d.xpath('div//text()') for d in divs]
            if rows:
                data = {}
                for x in rows:
                    if len(x) < 2:
                        continue  # label without a value: nothing to record
                    if "法定代表人" in x:
                        data[x[0]] = x[1] if len(x) == 2 else x[2]
                    elif "经营范围" in x or len(x) <= 3:
                        data[x[0]] = x[1]
                    else:
                        # Several label/value pairs share one row; zip drops a
                        # dangling label instead of raising IndexError.
                        data.update(zip(x[::2], x[1::2]))
                titles = tree.xpath('//meta[@name="tyc-wx-title"]/@content')
                result = {name: data.get(label, "") for name, label in fields}
                result["name_cn"] = titles[0] if titles else ""
                result.update({"email": "", "unit_phone": "", "fax": "", "website": ""})
                # "start至end"; partition also copes with a missing separator,
                # where a two-name unpack of split() raised ValueError.
                start, _, end = result["license_start_date"].partition("至")
                result["license_start_date"], result["license_end_date"] = start.strip(), end.strip()
                return result
        except Exception as e:
            logger.info(f'tyc_detail {e}')
        retry += 1
        if retry >= 2:
            return None
async def qcc(**kwargs):
    """Search Qichacha for ``key`` and fetch every hit's detail page.

    Accepts ``key``, ``proxy``, ``proxy_user``, ``proxy_pass`` and ``retry``.
    The search page embeds its state as JSON in the first ``<script>``; the
    hit list is read from ``search.searchRes.Result``. Returns the list of
    non-empty detail records, or ``None`` on failure or no hits.
    """
    key = kwargs.get("key", "")
    proxy_opts = {name: kwargs.get(name, "") for name in ("proxy", "proxy_user", "proxy_pass")}
    retry = kwargs.get("retry", 0)
    while True:
        try:
            raw = await pub_req(url="https://www.qcc.com/web/search", params={"key": key},
                                headers={"Cookie": "", "Referer": f'https://www.qcc.com/web/search?key={key}'},
                                **proxy_opts)
            if not raw:
                return None
            scripts = etree.HTML(raw.decode()).xpath('//script[1]/text()')
            content = '{"appState' + scripts[0].split("appState")[1].split(";(function")[0] if scripts else ""
            if not content:
                return None
            state = json.loads(content)
            hits = state["search"]["searchRes"].get("Result", "") if state else ""
            if not hits:
                return None
            # NOTE(review): the original chained `.replace("", "")` twice on
            # OperName and Address, which is a no-op. It looks like
            # highlight-markup stripping whose arguments were lost -
            # TODO confirm and restore.
            data_list = [
                {
                    "keyNo": r.get("KeyNo", ""),
                    "legal_person": r.get("OperName") or "",  # `or ""` guards explicit nulls
                    "email": r.get("Email", ""),
                    "unit_phone": r.get("ContactNumber", ""), "fax": "",
                    "address": r.get("Address") or "",
                    "website": r.get("GW", "")
                }
                for r in hits
            ]
            # Forward the proxy credentials too; the original passed only the proxy URL.
            details = await asyncio.gather(*(qcc_detail(data=d, **proxy_opts) for d in data_list))
            return [x for x in details if x]
        except Exception as e:
            logger.info(f'qcc {e}')
        retry += 1
        if retry >= 2:
            return None
async def qcc_detail(**kwargs):
    """Fetch one Qichacha company page and merge it with the search-hit data.

    Accepts ``data`` (a search hit carrying ``keyNo`` plus contact fields),
    ``proxy``, ``proxy_user``, ``proxy_pass`` and ``retry``.

    Returns the merged record, with values from ``data`` overriding the
    scraped ones and ``keyNo`` removed. Returns ``data`` unchanged when the
    detail page cannot be fetched, and ``None`` when ``data`` or its
    ``keyNo`` is missing or parsing fails on both attempts. The caller's
    dict is not modified.
    """
    from io import StringIO  # local import: read_html wants a file-like object

    data = kwargs.get("data", "")
    if not data:
        return None
    key_no = data.get("keyNo", "")
    if not key_no:
        return None

    # (output field, table label) in output order.
    fields = (
        ("social_credit_code", "统一社会信用代码"), ("name_cn", "企业名称"), ("legal_person", "法定代表人"),
        ("status", "登记状态"), ("found_date", "成立日期"), ("registered_capital", "注册资本"),
        ("really_capital", "实缴资本"), ("issue_date", "核准日期"), ("organization_code", "组织机构代码"),
        ("regist_code", "工商注册号"), ("taxpayer_code", "纳税人识别号"), ("type", "企业类型"),
        ("license_start_date", "营业期限"), ("taxpayer_crop", "纳税人资质"), ("industry_involved", "所属行业"),
        ("province", "所属地区"), ("regist_office", "登记机关"), ("staff_size", "人员规模"),
        ("insured_size", "参保人数"), ("transformer_name", "曾用名"), ("name_en", "英文名"),
        ("imp_exp_enterprise_code", "进出口企业代码"), ("regist_address", "注册地址"),
        ("business_scope", "经营范围"),
    )

    retry = kwargs.get("retry", 0)
    while True:
        try:
            raw = await pub_req(url=f'https://www.qcc.com/cbase/{key_no}.html',
                                headers={"Connection": "close", "Cookie": "",
                                         "Referer": f'https://www.qcc.com/firm/{key_no}.html'},
                                proxy=kwargs.get("proxy", ""), proxy_user=kwargs.get("proxy_user", ""),
                                proxy_pass=kwargs.get("proxy_pass", ""))
            if not raw:
                return data
            tables = pd.read_html(StringIO(raw.decode()))
            # The first table alternates label, value, label, value, ... across its rows.
            cells = [cell for row in tables[0].values.tolist() for cell in row]
            info = {}
            for label, value in zip(cells[::2], cells[1::2]):
                # Skip NaN padding cells and "复制" (copy-button) labels.
                if not isinstance(label, str) or "复制" in label:
                    continue
                text = "" if pd.isna(value) else str(value)
                info[label] = text.replace("复制", "").strip()

            result = {name: info.get(label, "") for name, label in fields}
            result["name_en"] = result["name_en"].split("(")[0]
            # First whitespace-separated token; "".split()[0] raised
            # IndexError and discarded the whole record.
            address_parts = result["regist_address"].split()
            result["regist_address"] = address_parts[0] if address_parts else ""
            start, _, end = result["license_start_date"].partition("至")
            result["license_start_date"], result["license_end_date"] = start.strip(), end.strip()

            # Merge without popping keyNo from the caller's dict, so a later
            # retry or log line can still read it.
            extra = {k: v for k, v in data.items() if k != "keyNo"}
            return result | extra
        except Exception as e:
            logger.info(f'qcc_detail {e} {key_no}')
        retry += 1
        if retry >= 2:
            return None
async def aqc(**kwargs):
    """Search Aiqicha (Baidu) for ``key`` and fetch every hit's detail record.

    Accepts ``key``, ``proxy``, ``proxy_user``, ``proxy_pass`` and ``retry``.
    The hit list is read from the JSON embedded in the first ``<script>``
    (``result.resultList``). Returns the list of non-empty detail records,
    or ``None`` on failure.
    """
    proxy_opts = {name: kwargs.get(name, "") for name in ("proxy", "proxy_user", "proxy_pass")}
    retry = kwargs.get("retry", 0)
    while True:
        try:
            raw = await pub_req(url="https://aiqicha.baidu.com/s", params={"q": kwargs.get("key", ""), "t": "0"},
                                headers={"Cookie": "", "Referer": 'https://aiqicha.baidu.com/'}, **proxy_opts)
            if not raw:
                return None
            scripts = etree.HTML(raw.decode()).xpath('//script[1]/text()')
            # Without a script payload there is nothing to parse; the original
            # fell through to an undefined `data_list` (NameError).
            if not scripts:
                return None
            payload = json.loads('{"sid"' + scripts[0].split('{"sid"')[1].split(";\n")[0])
            data_list = [{"pid": r["pid"]} for r in payload["result"]["resultList"]]
            details = await asyncio.gather(*(aqc_detail(data=d, **proxy_opts) for d in data_list))
            return [x for x in details if x]
        except Exception as e:
            logger.info(f'aqc {e}')
        retry += 1
        if retry >= 2:
            return None
async def aqc_detail(**kwargs):
    """Fetch one Aiqicha company record and map it to the common record layout.

    Accepts ``data`` (dict with ``pid``), ``proxy``, ``proxy_user``,
    ``proxy_pass`` and ``retry``. Returns the record dict, or ``None`` when
    ``data`` is missing, the request fails, or no ``basicData`` is returned
    on both attempts.
    """
    data = kwargs.get("data", "")
    if not data:
        return None
    retry = kwargs.get("retry", 0)
    while True:
        try:
            pid = data["pid"]
            raw = await pub_req(url="https://aiqicha.baidu.com/detail/basicAllDataAjax", params={"pid": pid},
                                headers={
                                    "Cookie": "",
                                    "Referer": f'https://aiqicha.baidu.com/company_detail_{pid}',
                                    "X-Requested-With": "XMLHttpRequest",
                                    "Zx-Open-Url": f'https://aiqicha.baidu.com/company_detail_{pid}'
                                },
                                proxy=kwargs.get("proxy", ""), proxy_user=kwargs.get("proxy_user", ""),
                                proxy_pass=kwargs.get("proxy_pass", ""))
            if not raw:
                return None
            payload = json.loads(raw.decode())
            basic = payload["data"]["basicData"] if payload.get("data", "") else ""
            if basic:
                # Province = district up to and including "省", else up to
                # "市". With neither marker keep the district as is; the
                # original produced a bare "市" for an empty district.
                district = basic.get("district", "") or ""
                if "省" in district:
                    province = f'{district.split("省")[0]}省'
                elif "市" in district:
                    province = f'{district.split("市")[0]}市'
                else:
                    province = district
                # prevEntName may be a list or a plain string.
                previous = basic.get("prevEntName", "")
                if isinstance(previous, list):
                    previous = previous[0] if previous else ""
                return {
                    "name_cn": basic.get("entName", ""),
                    "name_en": "",
                    "legal_person": basic.get("legalPerson", ""),
                    "registered_capital": basic.get("regCapital", ""),
                    "really_capital": basic.get("realCapital", ""),
                    "found_date": basic.get("startDate", ""),
                    "issue_date": basic.get("annualDate", ""),
                    "social_credit_code": basic.get("unifiedCode", ""),
                    "organization_code": basic.get("orgNo", ""),
                    "regist_code": basic.get("licenseNumber", ""),
                    "taxpayer_code": basic.get("regNo", ""),
                    "imp_exp_enterprise_code": "",
                    "industry_involved": basic.get("industry", ""),
                    "type": basic.get("entType", ""),
                    "license_start_date": basic.get("startDate", ""),
                    "license_end_date": basic.get("openTime", "").split("至")[-1].strip(),
                    "regist_office": basic.get("authority", ""),
                    "staff_size": "",
                    # A record without insurance data no longer discards the whole company.
                    "insured_size": (basic.get("insuranceInfo") or {}).get("insuranceNum", ""),
                    "province": province,
                    "address": basic.get("addr", ""),
                    "business_scope": basic.get("scope", ""),
                    "email": basic.get("email", ""),
                    "unit_phone": basic.get("telephone", ""),
                    "fax": "",
                    "website": basic.get("website", ""),
                    "regist_address": basic.get("regAddr", ""),
                    "transformer_name": previous,
                    "status": basic.get("openStatus", ""),
                }
        except Exception as e:
            logger.info(f"aqc_detail {e}")
        retry += 1
        if retry >= 2:
            return None
async def gsxt(**kwargs):
    """Search the National Enterprise Credit Information Publicity System app API.

    Accepts ``key``, ``proxy``, ``proxy_user``, ``proxy_pass`` and ``retry``.
    Returns the list of non-empty detail records, or ``None`` when the search
    fails or returns no data.
    """
    proxy_opts = {name: kwargs.get(name, "") for name in ("proxy", "proxy_user", "proxy_pass")}
    retry = kwargs.get("retry", 0)
    while True:
        try:
            raw = await pub_req(
                method="POST",
                url="https://app.gsxt.gov.cn/gsxt/corp-query-app-search-1.html",
                data={
                    "conditions": '{"excep_tab":"0","ill_tab":"0","area":"0","cStatus":"0","xzxk":"0","xzcf":"0","dydj":"0"}',
                    "searchword": kwargs.get("key", ""), "sourceType": "W"},
                headers={"X-Requested-With": "XMLHttpRequest"},
                **proxy_opts)
            if not raw:
                return None
            payload = json.loads(raw)
            # No "data" section means no hits; the original fell through to an
            # undefined `data_list` (NameError).
            if not payload.get("data", ""):
                return None
            # Forward the whole search record, not just `pripid`: gsxt_detail
            # builds its fallback record from the other fields of `data`.
            records = [dict(r) for r in payload["data"]["result"]["data"]]
            details = await asyncio.gather(*(gsxt_detail(data=r, **proxy_opts) for r in records))
            return [x for x in details if x]
        except Exception as e:
            logger.info(f'gsxt {e}')
        retry += 1
        if retry >= 2:
            return None
async def gsxt_detail(**kwargs):
    """Fetch one company's base info from the publicity-system app API.

    Accepts ``data`` (a search record carrying at least ``pripid``),
    ``proxy``, ``proxy_user``, ``proxy_pass`` and ``retry``.

    Returns the full record when the detail endpoint answers with a
    ``result`` section, otherwise a reduced record built from the fields of
    ``data``. Returns ``None`` when ``data`` or its ``pripid`` is missing,
    or when both attempts raise.
    """
    data = kwargs.get("data", "")
    if not data or not data.get("pripid"):
        return None
    retry = kwargs.get("retry", 0)
    while True:
        try:
            raw = await pub_req(
                url=f'https://app.gsxt.gov.cn/gsxt/corp-query-entprise-info-primaryinfoapp-entbaseInfo-{data["pripid"]}.html',
                params={"nodeNum": "310000", "entType": "6150", "sourceType": "W"},
                headers={"Referer": "https://servicewechat.com", "content-type": "application/x-www-form-urlencoded",
                         "Accept-Encoding": "gzip, deflate, br"},
                proxy=kwargs.get("proxy", ""), proxy_user=kwargs.get("proxy_user", ""),
                proxy_pass=kwargs.get("proxy_pass", ""))
            # Fallback record from the search hit.
            # NOTE(review): the original applied a no-op `.replace("", "")`
            # pair to entName / historyName, probably lost markup stripping -
            # TODO confirm.
            fallback = {
                "name_cn": data.get("entName", ""),
                "status": data.get("corpStatusString", ""),
                "regist_code": data.get("regNo", ""),
                "social_credit_code": data.get("uniscId", ""),
                "legal_person": data.get("legelRep", ""),
                "type": data.get("entTypeString", ""),
                "found_date": data.get("estDate", ""),
                "regist_office": data.get("regOrg", ""),
                "transformer_name": data.get("historyName", ""),
            }
            if not raw:
                return fallback
            res = json.loads(raw.decode())
            base = res.get("result")
            if not base:
                return fallback
            # Read with .get so one absent field does not discard the record.
            # NOTE(review): regCaption / regCapCurCN / nodeNum are read from
            # the top level of the payload, as in the original - confirm they
            # are not nested under "result".
            return {
                "name_cn": base.get("entName", ""),
                "name_en": "",
                "legal_person": base.get("name", ""),
                "registered_capital": f'{res.get("regCaption", "")}{res.get("regCapCurCN", "")}'.strip(),
                "really_capital": "",
                "found_date": base.get("estDate", ""),
                "issue_date": base.get("apprDate", ""),
                "social_credit_code": base.get("uniscId", ""),
                "organization_code": "",
                "regist_code": base.get("regNo", ""),
                "taxpayer_code": "",
                "imp_exp_enterprise_code": "",
                "industry_involved": base.get("industryPhy", ""),
                "type": base.get("entType_CN", ""),
                "license_start_date": base.get("opFrom", ""),
                "license_end_date": base.get("opTo", ""),
                "regist_office": base.get("regOrg_CN", ""),
                "staff_size": "",
                "insured_size": "",
                "province": res.get("nodeNum", ""),
                "address": base.get("dom", ""),
                "business_scope": base.get("opScope", ""),
                "email": "",
                "unit_phone": "",
                "fax": "",
                "website": "",
                "regist_address": base.get("dom", ""),
                "transformer_name": data.get("historyName", ""),
                "status": base.get("regState_CN", ""),
            }
        except Exception as e:
            logger.info(f'gsxt_detail {e}')
        retry += 1
        if retry >= 2:
            return None
async def test():
    """Ad-hoc smoke check: run one Qichacha search through a local proxy and log the result."""
    proxy = 'http://127.0.0.1:1080'
    logger.info(proxy)
    rs = await qcc(**{"key": "特变电工湖南工程有限公司", "proxy": proxy})
    logger.info(rs)


if __name__ == '__main__':
    # Manual entry point: fetch a single Tianyancha detail page and log it.
    # asyncio.run replaces the deprecated get_event_loop().run_until_complete
    # pattern. The commented-out experiments and the unused `proxy` / `kwargs`
    # locals were dropped.
    rs = asyncio.run(tyc_detail(**{"id": "3149889182"}))
    logger.info(rs)
asyncio.get_event_loop().run_until_complete(get_proxy()) 835 | # rs = asyncio.get_event_loop().run_until_complete(query_ip(**{"proxy": "http://182.111.108.203:45113"})) 836 | logger.info(rs) 837 | 838 | # Tunnel connection failed: 401 Authorized failed 839 | -------------------------------------------------------------------------------- /看准-企业工商信息查询.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Author: Litre WU 3 | # E-mail: litre-wu@tutanota.com 4 | # Software: PyCharm 5 | # File: 看准-企业工商信息查询.py 6 | # Time: 1月 06, 2022 7 | import asyncio 8 | from aiohttp import ClientSession, ClientTimeout, TCPConnector 9 | from user_agent import generate_user_agent 10 | from random import randint 11 | from time import sleep 12 | from sys import platform 13 | from json import loads 14 | from bs4 import BeautifulSoup 15 | import pandas as pd 16 | from fastapi import FastAPI 17 | from pydantic import BaseModel, Field 18 | import socket 19 | from boltons.cacheutils import LRU 20 | from hashlib import md5 21 | 22 | if platform == "win32": 23 | asyncio.set_event_loop(asyncio.ProactorEventLoop()) 24 | asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) 25 | 26 | host = socket.gethostbyname(socket.gethostname()) 27 | 28 | lru_cache = LRU(max_size=100) 29 | 30 | tags_metadata = [ 31 | { 32 | "name": "看准-企业工商信息查询接口", 33 | "description": "看准-企业工商信息查询", 34 | "externalDocs": { 35 | "description": "More", 36 | "url": f"http://{host}/docs", 37 | }, 38 | }, 39 | ] 40 | 41 | contact = { 42 | "name": "Litre", 43 | "url": "http://121.37.209.113", 44 | "email": "litre-wu@tutanota.com", 45 | } 46 | 47 | app = FastAPI(openapi_url="/api/v1/api.json", title="看准-企业工商信息查询接口", contact=contact, openapi_tags=tags_metadata) 48 | 49 | 50 | class SearchItem(BaseModel): 51 | query: str = Field(..., example='哔哩哔哩') 52 | cityCode: str = Field(..., example=0) 53 | industryCodes: str = Field(..., example='') 54 | pageNum: 
str = Field(..., example=1) 55 | limit: str = Field(..., example=15) 56 | 57 | 58 | # 查询接口 59 | @app.post("/search", tags=["看准-工商信息查询"]) 60 | async def search(data: SearchItem): 61 | kwargs = data.dict() 62 | key = md5(str(kwargs).encode()).hexdigest() 63 | if lru_cache.get(key): return lru_cache[key] 64 | result = await query(**kwargs) 65 | if result: lru_cache[key] = result 66 | return result 67 | 68 | 69 | class InfoItem(BaseModel): 70 | encCompanyId: str = Field(..., example='0XN_2dW7Fw~~') 71 | 72 | 73 | # 工商信息接口 74 | @app.post("/compInfo", tags=["看准-工商信息查询"]) 75 | async def info(data: InfoItem): 76 | kwargs = data.dict() 77 | key = md5(str(kwargs).encode()).hexdigest() 78 | if lru_cache.get(key): return lru_cache[key] 79 | result = await compInfo(**kwargs) 80 | if result: lru_cache[key] = result 81 | return result 82 | 83 | 84 | # 公共请求函数 85 | async def pub_req(**kwargs): 86 | method = kwargs.get("method", "GET") 87 | url = kwargs.get("url", "") 88 | params = kwargs.get("params", {}) 89 | data = kwargs.get("data", {}) 90 | headers = {**{"User-Agent": generate_user_agent()}, **kwargs.get("headers", {})} 91 | proxy = kwargs.get("proxy", "") 92 | timeout = kwargs.get("timeout", 10) 93 | try: 94 | async with asyncio.Semaphore(20): 95 | async with ClientSession(timeout=ClientTimeout(total=3), 96 | connector=TCPConnector(ssl=False), 97 | trust_env=True) as client: 98 | async with client.request(method=method, url=url, params=params, data=data, headers=headers, 99 | proxy=proxy, 100 | timeout=timeout) as rs: 101 | if rs.status == 200 or 201: 102 | content = await rs.read() 103 | return content 104 | else: 105 | sleep(randint(1, 2)) 106 | retry = kwargs.get("retry", 0) 107 | retry += 1 108 | if retry >= 2: 109 | return None 110 | kwargs["retry"] = retry 111 | return await pub_req(**kwargs) 112 | except Exception as e: 113 | print(e) 114 | sleep(randint(1, 2)) 115 | retry = kwargs.get("retry", 0) 116 | retry += 1 117 | if retry >= 2: 118 | return None 119 | 
kwargs["retry"] = retry 120 | return await pub_req(**kwargs) 121 | 122 | 123 | # 查询 124 | async def query(**kwargs): 125 | meta = { 126 | "url": "https://www.kanzhun.com/search/company_v2.json", 127 | "params": { 128 | "query": kwargs.get("query", "哔哩哔哩"), 129 | "cityCode": kwargs.get("cityCode", 0), 130 | "industryCodes": kwargs.get("industryCodes", ""), 131 | "pageNum": kwargs.get("pageNum", 1), 132 | "limit": kwargs.get("limit", 15), 133 | }, 134 | "headers": { 135 | "Accept-Encoding": "gzip, deflate, br" 136 | } 137 | } 138 | res = await pub_req(**meta) 139 | if not res: return None 140 | # print(res.decode()) 141 | return loads(res) 142 | 143 | 144 | # 工商信息 145 | async def compInfo(**kwargs): 146 | meta = { 147 | "url": f'https://www.kanzhun.com/firm/info/{kwargs.get("encCompanyId", "")}.html', 148 | "headers": { 149 | "Accept-Encoding": "gzip, deflate, br" 150 | } 151 | } 152 | res = await pub_req(**meta) 153 | if not res: return None 154 | soup = BeautifulSoup(res.decode(), 'html.parser') 155 | div = soup.find_all("div", class_="kz-company-desc") 156 | if div: 157 | table = div[0].table 158 | table = pd.read_html(str(table)) 159 | info = {} 160 | for x in table[0].values.tolist(): 161 | for i in range(0, len(x), 2): 162 | if x[i].strip("-"): 163 | info[x[i]] = x[i + 1] 164 | return info 165 | 166 | 167 | async def main(): 168 | rs = await query() 169 | # rs = await compInfo(**{"encCompanyId": "0XN_2dW7Fw~~"}) 170 | print(rs) 171 | 172 | 173 | if __name__ == '__main__': 174 | asyncio.get_event_loop().run_until_complete(main()) 175 | --------------------------------------------------------------------------------