├── Dockerfile
├── README.md
├── conf.py
├── requirements.txt
├── 工商信息查询.py
└── 看准-企业工商信息查询.py
/Dockerfile:
--------------------------------------------------------------------------------
# Base image
FROM python:3.10.1

# Container time zone
ENV TZ=Asia/Shanghai

# All application files live in /app
WORKDIR /app

# Install the dependencies first so this layer is reused from the build cache
# whenever only the source code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt -i https://pypi.doubanio.com/simple/

# Copy the rest of the project into the image
COPY . /app

CMD ["gunicorn", "-c", "conf.py", "看准-企业工商信息查询:app"]
17 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 企业工商信息查询接口
2 |
3 | 企业工商信息接口(包含天眼查、企查查、爱企查、国家企业信用信息公示系统、看准)
4 |
5 | 接口文档(http://127.0.0.1:8081/docs)
6 |
7 | tip:代理设置(在 工商信息查询.py 第158行的 get_proxy 函数中更换代理接口)
8 |
9 | 项目运行
10 |
11 | `pip install -r requirements.txt -i https://pypi.doubanio.com/simple/`
12 |
13 | ` uvicorn 工商信息查询:app --host 0.0.0.0 --port 8081 --reload`
14 |
15 | docker 运行
16 |
17 | `docker build -t businessinfo https://github.com/Litre-WU/businessInfo-api.git `
18 |
19 | `docker run --name businessInfo -d -p 8081:8081 businessinfo`
20 |
--------------------------------------------------------------------------------
/conf.py:
--------------------------------------------------------------------------------
1 | # vim: set fileencoding:utf-8
2 | # -*- coding: utf-8 -*-
3 | # Author: Litre WU
4 | # E-mail: litre-wu@tutanota.com
5 | # Software: PyCharm
6 | # File: gunicorn.py
7 | # Time: 4月 23, 2021
8 | import logging
9 | import logging.handlers
10 | from logging.handlers import WatchedFileHandler
11 | import os
12 | import multiprocessing
13 |
# Gunicorn settings, loaded through `gunicorn -c conf.py ...`.

# chdir = '/app'  # change to this directory before loading the application

proc_name = 'businessInfo'  # process name

bind = '0.0.0.0:8081'  # address and port to listen on

backlog = 512  # size of the listen queue

timeout = 10  # worker timeout

# worker_class = 'gevent'  # the default is the sync worker
# worker_class = 'uvicorn.workers.UvicornWorker'  # uvicorn worker
worker_class = 'uvicorn.workers.UvicornH11Worker'  # pure-Python (h11) uvicorn worker

# workers = multiprocessing.cpu_count() * 2 + 1  # number of worker processes
# workers = multiprocessing.cpu_count() * 2 + 1  # number of worker processes
workers = 4  # number of worker processes

# NOTE(review): presumably only honoured by thread-based worker classes - confirm it has any effect here
threads = 4  # number of threads started per worker process

# daemon = True  # run as a daemon

# NOTE(review): gunicorn documents `reload` as a development setting - confirm it is wanted in the image
reload = True  # reload automatically

worker_connections = 2000  # maximum number of concurrent connections

loglevel = 'info'  # log level; this applies to the error log, the access log level cannot be set

# accesslog = "/businessInfo/logs/demo_access.log"  # access log file, "-" means stdout

# errorlog = "/businessInfo/logs/demo_err.log"  # error log file, "-" means stdout

# access_log_format = '%(h)s %(l)s %(u)s %(t)s'
47 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp
2 | fastapi
3 | user-agent
4 | lxml
5 | uvicorn
6 | gunicorn
7 | pandas
8 | python-multipart
9 | brotlipy
10 | loguru
11 | boltons
12 | beautifulsoup4
13 |
--------------------------------------------------------------------------------
/工商信息查询.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Author: Litre WU
3 | # E-mail: litre-wu@tutanota.com
4 | # Software: PyCharm
5 | # File: 工商信息查询.py
6 | # Time: 4月 21, 2021
7 | import asyncio
8 | from typing import Optional, List
9 | from fastapi import FastAPI, Header, Cookie, Depends, BackgroundTasks
10 | from starlette.requests import Request
11 | from pydantic import BaseModel, Field
12 | from fastapi.responses import JSONResponse
13 | import aiohttp
14 | from user_agent import generate_user_agent
15 | from lxml import etree
16 | import pandas as pd
17 | import json
18 | import time
19 | from random import randint, sample
20 | import os
21 | from json import load, dump
22 | import socket
23 | from sys import platform
24 | from functools import lru_cache
25 | from loguru import logger
26 | from boltons.cacheutils import LRI, LRU
27 | from hashlib import md5
28 |
# In-process caches from boltons, each holding at most 100 entries.
# NOTE: this rebinds the name `lru_cache` that was imported from functools above,
# so the functools decorator is no longer reachable under that name.
lri_cache = LRI(max_size=100)
lru_cache = LRU(max_size=100)

# Log to "<script name without .py>.log"; rotate at 200 MB, zip rotated files, keep them 7 days.
logger.add(f'{os.path.basename(__file__)[:-3]}.log', rotation='200 MB', compression='zip', enqueue=True, serialize=False, encoding='utf-8', retention='7 days')


# Address the local host name resolves to; used for the documentation link in tags_metadata.
host = socket.gethostbyname(socket.gethostname())

if platform == "win32":
    # NOTE(review): a proactor loop is installed as the current loop and then the selector
    # policy is selected for loops created afterwards - confirm which of the two is intended.
    asyncio.set_event_loop(asyncio.ProactorEventLoop())
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

# OpenAPI tag metadata shown in the generated documentation.
tags_metadata = [
    {
        "name": "企业工商信息查询接口",
        "description": "企业工商信息查询(天眼查、企查查、爱企查、国家企业公示系统)",
        "externalDocs": {
            "description": "More",
            # NOTE(review): no port in this URL - confirm it matches the port the service listens on
            "url": f"http://{host}/docs",
        },
    },
]

# Contact block shown in the generated documentation.
contact = {
    "name": "Litre",
    "url": "http://121.37.209.113",
    "email": "litre-wu@tutanota.com",
}

# The OpenAPI schema is served at /api/v1/api.json.
app = FastAPI(openapi_url="/api/v1/api.json", title="企业工商信息查询接口", contact=contact, openapi_tags=tags_metadata)
59 |
60 |
61 | # 日志
async def log(request, **kwargs):
    """Write one access-log line for *request* through the module logger.

    The line holds the client address, method, path and protocol taken from the
    request scope, followed by the keyword arguments (an empty string when there
    are none).
    """
    scope = dict(request.items())
    extra = kwargs if kwargs else ""
    logger.info(
        f'{scope["client"][0]} {scope["method"]} {scope["path"]} {scope["type"]}/{scope["http_version"]} {extra}'
    )
67 |
68 |
69 | # 首页
@app.get("/", tags=["首页"])
async def index(request: Request, user_agent: Optional[str] = Header(None), x_token: List[str] = Header(None), ):
    """Landing page: echo what the server sees of the caller.

    Args:
        request: the incoming request.
        user_agent: value of the User-Agent header, if sent.
        x_token: values of the X-Token header, if sent.

    Returns:
        A JSONResponse with a greeting plus the client address, the received
        headers and the URL of the OpenAPI schema.
    """
    # request.client is None when the server cannot determine the peer address.
    client_ip = request.client.host if request.client else None
    result = {
        "code": 200,
        "msg": "来了!老弟",
        "result": "你看这个面它又长又宽,就像这个碗它又大又圆",
        "info": {
            # Report the URL the app is actually configured with instead of a
            # hard-coded path that does not exist on this app.
            "openapi_url": app.openapi_url,
            "ip": client_ip,
            "x-token": x_token,
            "user-agent": user_agent,
            "headers": dict(request.headers)
        }
    }
    return JSONResponse(result)
85 |
86 |
# Request body of the POST "/" lookup endpoint.
class Qcc(BaseModel):
    # Search keyword; the scrapers in this file read it as `key`.
    key: str = Field(..., example='哔哩哔哩')
    # Unified social credit code. Required by the schema, but the example text says
    # "not used for now" and no scraper in this file reads it.
    creditCode: str = Field(..., example='统一社会信用代码(暂不使用)')
90 |
91 |
@app.post("/", tags=["企业工商信息查询接口"])
async def api(data: Qcc, request: Request, background_tasks: BackgroundTasks, x_token: List[str] = Header(None),
              user_agent: Optional[str] = Header(None)):
    """Look up company registration data for the posted keyword.

    Args:
        data: request body (search keyword and credit code).
        request: the incoming request, used for access logging.
        background_tasks, x_token, user_agent: accepted but not used here.

    Returns:
        A JSONResponse carrying the payload produced by query(), served from the
        in-process LRU cache when the same body was answered successfully before.
    """
    kwargs = data.dict()
    await log(request, **kwargs)
    key = md5(str(kwargs).encode()).hexdigest()
    cached = lru_cache.get(key)
    if cached:
        return JSONResponse(cached)
    result = await query(**kwargs)
    # query() returns a non-empty dict even on failure ("msg": "Fail"), so test the
    # inner result; otherwise a transient failure would be served from the cache.
    if result and result.get("result"):
        lru_cache[key] = result
    return JSONResponse(result)
102 |
103 |
104 | # 公共请求函数
async def pub_req(**kwargs):
    """Send one HTTP request with aiohttp and return the raw response body.

    Keyword Args:
        url: target URL (required; without it nothing is sent).
        method: HTTP method, "GET" by default.
        params: query-string parameters.
        data: request body / form fields.
        headers: extra headers; they override the generated ones.
        proxy: proxy URL, empty for a direct connection.
        proxy_user, proxy_pass: proxy credentials, sent only when a user is given.
        timeout: total timeout in seconds (or an aiohttp.ClientTimeout), 5 by default.
        retry: number of attempts already made by the caller.

    Returns:
        The body as bytes when the server answers 200, otherwise None once two
        attempts have been used up.
    """
    if not kwargs.get("url", ""):
        return None
    # The original referenced an undefined name `ip` here, so every call raised
    # NameError. A random address is generated for the client-address headers.
    ip = ".".join(str(randint(1, 254)) for _ in range(4))
    address_headers = (
        "X-Forwarded-For", "X-Forwarded", "Forwarded-For", "Forwarded", "X-Forwarded-Proto",
        "X-Forwarded-Host", "X-Requested-With", "X-Client-IP", "X-remote-IP", "X-remote-addr",
        "X-Real-IP", "True-Client-IP", "Client-IP", "X_FORWARDED_FOR", "X_REAL_IP",
    )
    headers = {
        **dict.fromkeys(address_headers, ip),
        "User-Agent": generate_user_agent(),
        **kwargs.get("headers", {}),
    }
    timeout = kwargs.get("timeout", 5)
    if not isinstance(timeout, aiohttp.ClientTimeout):
        # Passing a bare number as the request timeout is deprecated in aiohttp.
        timeout = aiohttp.ClientTimeout(total=timeout)
    proxy = kwargs.get("proxy") or None
    proxy_user = kwargs.get("proxy_user", "")
    proxy_auth = aiohttp.BasicAuth(proxy_user, kwargs.get("proxy_pass", "")) if proxy and proxy_user else None

    attempt = kwargs.get("retry", 0)
    while True:
        try:
            async with aiohttp.ClientSession(timeout=timeout, connector=aiohttp.TCPConnector(ssl=False),
                                             trust_env=True) as client:
                async with client.request(method=kwargs.get("method", "GET"), url=kwargs["url"],
                                          params=kwargs.get("params") or None,
                                          data=kwargs.get("data") or None, headers=headers,
                                          proxy=proxy, proxy_auth=proxy_auth) as rs:
                    if rs.status == 200:
                        return await rs.read()
                    body = await rs.text(errors="replace")
                    logger.info(f"pub_req {kwargs} {rs.status} {body[:200]}")
        except Exception as e:
            logger.info(f"pub_req {kwargs} {e}")
        attempt += 1
        if attempt >= 2:
            return None
        # asyncio.sleep keeps the event loop serving other requests while waiting.
        await asyncio.sleep(randint(1, 2))
156 |
157 |
158 | # 代理
async def get_proxy(**kwargs):
    """Return a list of proxy descriptions, fetching a new one when needed.

    Keyword Args:
        turn: when truthy, skip the cached proxy in proxy.json and fetch a new one.
        retry: number of attempts already made by the caller.

    Returns:
        The cached or freshly fetched proxy list (also written to proxy.json),
        or None when no proxy could be obtained within two attempts.
    """
    if not kwargs.get("turn", 0):
        try:
            with open('proxy.json', 'r') as f:
                cached = json.load(f)
            if cached:
                expire_time = int(time.mktime(time.strptime(cached[0]["expire_time"], "%Y-%m-%d %H:%M:%S")))
                if int(time.time()) < expire_time:
                    return cached
        except FileNotFoundError:
            pass  # nothing cached yet
        except (OSError, ValueError, KeyError, IndexError, TypeError) as e:
            # A corrupt or unexpected cache file must not prevent fetching a new proxy.
            logger.info(f'get_proxy cache ignored: {e}')
    # Alternative provider (fanqieip), kept for reference:
    # url = 'http://x.fanqieip.com/gip'
    # params = {"getType": "3","qty": "1","port": "1","time": "1","city": "0","format": "2","ss": "1","dt": "1","css": ""}
    # Provider in use: zhimacangku
    url = 'http://webapi.http.zhimacangku.com/getip'
    params = {"num": "1", "type": "2", "pro": "0", "city": "0", "yys": "0", "port": "1", "time": "1", "ts": "1",
              "ys": "0", "cs": "0", "lb": "1", "sb": "0", "pb": "4", "mr": "1", "regions": ""}
    attempt = kwargs.get("retry", 0)
    while True:
        try:
            result = await pub_req(url=url, params=params)
            # Check for a failed request before decoding; the original decoded first
            # and raised AttributeError on None.
            if not result:
                return None
            logger.info(result.decode())
            result = json.loads(result)
            if result.get("data", ""):
                with open('proxy.json', 'w') as f:
                    json.dump(result["data"], f)
                return result["data"]
        except Exception as e:
            logger.info(e)
        attempt += 1
        if attempt >= 2:
            return None
        # Non-blocking pause before asking the provider again.
        await asyncio.sleep(randint(0, 1))
207 |
208 |
209 | # IP查询
async def query_ip(**kwargs):
    """Ask httpbin which address a request appears to come from.

    Keyword Args:
        proxy, proxy_user, proxy_pass: proxy settings forwarded to pub_req.
        retry: number of attempts already made by the caller.

    Returns:
        The first address of the "origin" field, or None when the request or the
        parsing failed.
    """
    url = 'http://httpbin.org/get?show_env=1'
    attempt = kwargs.get("retry", 0)
    while True:
        try:
            meta = {
                "url": url,
                "proxy": kwargs.get("proxy", ""),
                "proxy_user": kwargs.get("proxy_user", ""),
                "proxy_pass": kwargs.get("proxy_pass", ""),
            }
            result = await pub_req(**meta)
            if not result:
                return None
            result = json.loads(result)
            # "origin" may list several comma-separated addresses; splitting on
            # whitespace left a trailing comma on the first one.
            return result["origin"].split(",")[0].strip()
        except Exception as e:
            logger.info(f'query_ip {e}')
        attempt += 1
        if attempt >= 2:
            return None
        # Non-blocking pause before the next attempt.
        await asyncio.sleep(randint(1, 2))
234 |
235 |
236 | # 查询
async def query(**kwargs):
    """Run the provider scrapers in order until one of them yields data.

    The providers are tried as qcc, tyc, aqc, gsxt. When all of them come back
    empty, the round is repeated with the proxy from get_proxy(), and once more
    with a freshly fetched proxy (turn=1).

    Keyword Args:
        key: search keyword.
        proxy: optional proxy URL for the first round.
        retry: number of rounds already made by the caller.
        Any other keyword is passed through to the providers unchanged.

    Returns:
        {"code": 200, "msg": "OK", "result": [...]} on success, otherwise
        {"code": 200, "msg": "Fail", "result": None}.
    """
    # Keep the round counter out of the provider kwargs: the providers use the same
    # "retry" key for their own retries and would otherwise never retry.
    attempt = kwargs.pop("retry", 0)
    while True:
        for provider in (qcc, tyc, aqc, gsxt):
            found = await provider(**kwargs)
            if found:
                return {"code": 200, "msg": "OK", "result": found}
        attempt += 1
        if attempt > 2:
            return {"code": 200, "msg": "Fail", "result": None}
        # First repeat: cached proxy is acceptable. Second repeat: force a new one.
        proxy = await get_proxy() if attempt == 1 else await get_proxy(**{"turn": 1})
        kwargs["proxy"] = f'http://{proxy[0]["ip"]}:{proxy[0]["port"]}' if proxy else ""
267 |
268 |
269 | # 天眼查
async def tyc(**kwargs):
    """Search m.tianyancha.com for the keyword and fetch every hit's detail page.

    Keyword Args:
        key: search keyword.
        proxy, proxy_user, proxy_pass: proxy settings, also used for the detail requests.
        retry: number of attempts already made by the caller.

    Returns:
        A list of company records (empty results dropped), or None when the search
        request failed, found nothing, or raised twice.
    """
    proxy_opts = {name: kwargs.get(name, "") for name in ("proxy", "proxy_user", "proxy_pass")}
    attempt = kwargs.get("retry", 0)
    while True:
        try:
            meta = {
                "url": "https://m.tianyancha.com/search",
                "params": {"key": kwargs.get("key", "")},
                "headers": {"Referer": "https://m.tianyancha.com"},
                # The original filled proxy_pass from the proxy_user keyword.
                **proxy_opts,
            }
            result = await pub_req(**meta)
            if not result:
                return None
            html = result.decode()
            onclicks = etree.HTML(html).xpath('//div[@class="search-company-item"]/@onclick')
            if not onclicks:
                return None
            # The attribute looks like jumpToCompany('<id>'); take the quoted part.
            # str.strip() removes character sets, not a prefix/suffix, so it is not used.
            ids = [parts[1] for parts in (x.split("'") for x in onclicks) if len(parts) >= 3]
            details = await asyncio.gather(*(tyc_detail(id=company_id, **proxy_opts) for company_id in ids))
            return [x for x in details if x]
        except Exception as e:
            logger.info(f'tyc {e}')
        attempt += 1
        if attempt >= 2:
            return None
298 |
299 |
300 | # 天眼查详情
def _tyc_fields(rows):
    """Turn the text nodes of each detail row into one label -> value dict."""
    data = {}
    for x in rows:
        if len(x) < 2:
            continue  # a label without a value carries no information
        if "法定代表人" in x:
            # With more than two nodes the value is the third one.
            data[x[0]] = x[1] if len(x) == 2 else x[2]
        elif "经营范围" in x or len(x) <= 3:
            data[x[0]] = x[1]
        else:
            # Several label/value pairs in one row; zip drops an unpaired trailing label.
            for label, value in zip(x[::2], x[1::2]):
                data[label] = value
    return data


async def tyc_detail(**kwargs):
    """Fetch and parse one company page from m.tianyancha.com.

    Keyword Args:
        id: company id (required).
        proxy, proxy_user, proxy_pass: proxy settings forwarded to pub_req.
        retry: number of attempts already made by the caller.

    Returns:
        A dict with the normalised company fields, or None when the id is missing,
        the request failed, or the page could not be parsed within two attempts.
    """
    _id = kwargs.get("id", "")
    if not _id:
        return None
    attempt = kwargs.get("retry", 0)
    while True:
        try:
            meta = {
                "url": f'https://m.tianyancha.com/company/{_id}',
                "headers": {
                    "Referer": "https://m.tianyancha.com/search",
                },
                "proxy": kwargs.get("proxy", ""),
                "proxy_user": kwargs.get("proxy_user", ""),
                "proxy_pass": kwargs.get("proxy_pass", ""),
            }
            result = await pub_req(**meta)
            if not result:
                return None
            tree = etree.HTML(result.decode())  # parsed once and reused below
            divs = tree.xpath('//div[@class="content"]/div[@class="divide-content"]/div')
            info = [x.xpath('div//text()') for x in divs]
            if info:
                data = _tyc_fields(info)
                titles = tree.xpath('//meta[@name="tyc-wx-title"]/@content')
                # partition never raises, unlike unpacking split("至") when the
                # separator is missing or repeated.
                license_start, _, license_end = data.get("营业期限", "").partition("至")
                result = {
                    "social_credit_code": data.get("统一社会信用代码", ""),
                    "name_cn": titles[0] if titles else "",
                    "legal_person": data.get("法定代表人", ""),
                    "status": data.get("经营状态", ""),
                    "found_date": data.get("成立日期", ""),
                    "registered_capital": data.get("注册资本", ""),
                    "really_capital": data.get("实缴资本", ""),
                    "issue_date": data.get("核准日期", ""),
                    "organization_code": data.get("组织机构代码", ""),
                    "regist_code": data.get("工商注册号", ""),
                    "taxpayer_code": data.get("纳税人识别号", ""),
                    "type": data.get("企业类型", ""),
                    "license_start_date": license_start,
                    "taxpayer_crop": data.get("纳税人资质", ""),
                    "industry_involved": data.get("行业", ""),
                    "province": data.get("所属地区", ""),
                    "regist_office": data.get("登记机关", ""),
                    "staff_size": data.get("人员规模", ""),
                    "insured_size": data.get("参保人数", ""),
                    "transformer_name": data.get("曾用名", ""),
                    "name_en": data.get("英文名称", ""),
                    "imp_exp_enterprise_code": data.get("进出口企业代码", ""),
                    "address": data.get("注册地址", ""),
                    "regist_address": data.get("注册地址", ""),
                    "business_scope": data.get("经营范围", ""),
                    "email": "",
                    "unit_phone": "",
                    "fax": "",
                    "website": "",
                }
                result["license_end_date"] = license_end
                return result
            # No detail rows on the page: fall through and try again.
        except Exception as e:
            logger.info(f'tyc_detail {e}')
        attempt += 1
        if attempt >= 2:
            return None
386 |
387 |
388 | # 企查查
def _qcc_plain(text):
    """Return *text* without <em> markup; None becomes an empty string."""
    return (text or "").replace("<em>", "").replace("</em>", "")


async def qcc(**kwargs):
    """Search www.qcc.com for the keyword and fetch every hit's detail page.

    Keyword Args:
        key: search keyword.
        proxy, proxy_user, proxy_pass: proxy settings, also used for the detail requests.
        retry: number of attempts already made by the caller.

    Returns:
        A list of company records (empty results dropped), or None when the search
        request failed, the embedded state held no results, or parsing raised twice.
    """
    proxy_opts = {name: kwargs.get(name, "") for name in ("proxy", "proxy_user", "proxy_pass")}
    attempt = kwargs.get("retry", 0)
    while True:
        try:
            meta = {
                "url": "https://www.qcc.com/web/search",
                "params": {"key": kwargs.get("key", "")},
                "headers": {
                    "Cookie": "",
                    "Referer": f'https://www.qcc.com/web/search?key={kwargs.get("key", "")}'
                },
                **proxy_opts,
            }
            result = await pub_req(**meta)
            if not result:
                return None
            html = result.decode()
            scripts = etree.HTML(html).xpath('//script[1]/text()')
            if not scripts:
                return None
            # The first script embeds the page state as JSON; cut it out of the JS source.
            content = '{"appState' + scripts[0].split("appState")[1].split(";(function")[0]
            state = json.loads(content)
            hits = state["search"]["searchRes"].get("Result", "") if state else ""
            if not hits:
                return None
            # NOTE(review): the original chained .replace("", "") calls, which are no-ops;
            # presumably they were meant to drop <em> highlight tags - confirm against a live response.
            data_list = [
                {
                    "keyNo": r.get("KeyNo", ""),
                    "legal_person": _qcc_plain(r.get("OperName", "")),
                    "email": r.get("Email", ""),
                    "unit_phone": r.get("ContactNumber", ""), "fax": "",
                    "address": _qcc_plain(r.get("Address", "")),
                    "website": r.get("GW", "")
                }
                for r in hits
            ]
            details = await asyncio.gather(*(qcc_detail(data=item, **proxy_opts) for item in data_list))
            return [x for x in details if x]
        except Exception as e:
            logger.info(f'qcc {e}')
        attempt += 1
        if attempt >= 2:
            return None
436 |
437 |
438 | # 企查查企业详情
async def qcc_detail(**kwargs):
    """Fetch the registration table of one company from www.qcc.com.

    Keyword Args:
        data: dict produced by qcc(); must hold "keyNo" and may hold contact fields.
        proxy, proxy_user, proxy_pass: proxy settings forwarded to pub_req.
        retry: number of attempts already made by the caller.

    Returns:
        The parsed record merged with the contact fields from *data* (without
        "keyNo"); *data* itself when the page could not be downloaded; None when
        *data* is empty or parsing failed twice.
    """
    from io import StringIO  # read_html expects a file-like object for literal HTML

    data = kwargs.get("data", "")
    if not data:
        return None
    attempt = kwargs.get("retry", 0)
    while True:
        try:
            meta = {
                "url": f'https://www.qcc.com/cbase/{data["keyNo"]}.html',
                "headers": {
                    "Connection": "close",
                    "Cookie": "",
                    "Referer": f'https://www.qcc.com/firm/{data["keyNo"]}.html'
                },
                "proxy": kwargs.get("proxy", ""),
                "proxy_user": kwargs.get("proxy_user", ""),
                "proxy_pass": kwargs.get("proxy_pass", ""),
            }
            result = await pub_req(**meta)
            if not result:
                return data
            tables = pd.read_html(StringIO(result.decode()))
            # The first table alternates label and value cells, row after row.
            cells = [cell for row in tables[0].values.tolist() for cell in row]
            info = {}
            for label, value in zip(cells[::2], cells[1::2]):
                # Labels can be NaN for padded cells; "复制" marks a copy-button cell.
                if not isinstance(label, str) or "复制" in label:
                    continue
                # pandas may hand back numbers or NaN instead of text.
                text = "" if pd.isna(value) else str(value)
                info[label] = text.replace("复制", "").strip()
            address_parts = info.get("注册地址", "").split()
            # partition never raises, unlike unpacking split("至").
            license_start, _, license_end = info.get("营业期限", "").partition("至")
            result = {
                "social_credit_code": info.get("统一社会信用代码", ""),
                "name_cn": info.get("企业名称", ""),
                "legal_person": info.get("法定代表人", ""),
                "status": info.get("登记状态", ""),
                "found_date": info.get("成立日期", ""),
                "registered_capital": info.get("注册资本", ""),
                "really_capital": info.get("实缴资本", ""),
                "issue_date": info.get("核准日期", ""),
                "organization_code": info.get("组织机构代码", ""),
                "regist_code": info.get("工商注册号", ""),
                "taxpayer_code": info.get("纳税人识别号", ""),
                "type": info.get("企业类型", ""),
                "license_start_date": license_start.strip(),
                "taxpayer_crop": info.get("纳税人资质", ""),
                "industry_involved": info.get("所属行业", ""),
                "province": info.get("所属地区", ""),
                "regist_office": info.get("登记机关", ""),
                "staff_size": info.get("人员规模", ""),
                "insured_size": info.get("参保人数", ""),
                "transformer_name": info.get("曾用名", ""),
                "name_en": info.get("英文名", "").split("(")[0],
                "imp_exp_enterprise_code": info.get("进出口企业代码", ""),
                # Indexing [0] on an empty split raised IndexError when the address was missing.
                "regist_address": address_parts[0] if address_parts else "",
                "business_scope": info.get("经营范围", ""),
            }
            result["license_end_date"] = license_end.strip()
            # Values from the search hit take precedence; the caller's dict is not modified.
            contact = {k: v for k, v in data.items() if k != "keyNo"}
            return result | contact
        except Exception as e:
            logger.info(f'qcc_detail {e} {data.get("keyNo", "") if isinstance(data, dict) else ""}')
        attempt += 1
        if attempt >= 2:
            return None
568 |
569 |
570 | # 爱企查
async def aqc(**kwargs):
    """Search aiqicha.baidu.com for the keyword and fetch every hit's details.

    Keyword Args:
        key: search keyword.
        proxy, proxy_user, proxy_pass: proxy settings, also used for the detail requests.
        retry: number of attempts already made by the caller.

    Returns:
        A list of company records (empty results dropped), or None when the search
        request failed, the page held no script payload, or parsing raised twice.
    """
    proxy_opts = {name: kwargs.get(name, "") for name in ("proxy", "proxy_user", "proxy_pass")}
    attempt = kwargs.get("retry", 0)
    while True:
        try:
            meta = {
                "url": "https://aiqicha.baidu.com/s",
                "params": {"q": kwargs.get("key", ""), "t": "0"},
                "headers": {"Cookie": "", "Referer": 'https://aiqicha.baidu.com/'},
                **proxy_opts,
            }
            result = await pub_req(**meta)
            if not result:
                return None
            html = result.decode()
            scripts = etree.HTML(html).xpath('//script[1]/text()')
            if not scripts:
                return None
            # The first script embeds the search result as JSON starting at {"sid".
            payload = json.loads('{"sid"' + scripts[0].split('{"sid"')[1].split(";\n")[0])
            data_list = [{"pid": r["pid"]} for r in payload["result"]["resultList"]]
            details = await asyncio.gather(*(aqc_detail(data=item, **proxy_opts) for item in data_list))
            return [x for x in details if x]
        except Exception as e:
            logger.info(f'aqc {e}')
        attempt += 1
        if attempt >= 2:
            return None
607 |
608 |
609 | # 爱企查企业详情
async def aqc_detail(**kwargs):
    """Fetch the basic data of one company from aiqicha.baidu.com.

    Keyword Args:
        data: dict holding the company "pid" (required).
        proxy, proxy_user, proxy_pass: proxy settings forwarded to pub_req.
        retry: number of attempts already made by the caller.

    Returns:
        A dict with the normalised company fields, or None when *data* is empty,
        the request failed, or no basic data was obtained within two attempts.
    """
    data = kwargs.get("data", "")
    if not data:
        return None
    attempt = kwargs.get("retry", 0)
    while True:
        try:
            meta = {
                "url": "https://aiqicha.baidu.com/detail/basicAllDataAjax",
                "params": {"pid": data["pid"]},
                "headers": {
                    "Cookie": "",
                    "Referer": f'https://aiqicha.baidu.com/company_detail_{data["pid"]}',
                    "X-Requested-With": "XMLHttpRequest",
                    "Zx-Open-Url": f'https://aiqicha.baidu.com/company_detail_{data["pid"]}'
                },
                "proxy": kwargs.get("proxy", ""),
                "proxy_user": kwargs.get("proxy_user", ""),
                "proxy_pass": kwargs.get("proxy_pass", ""),
            }
            result = await pub_req(**meta)
            if not result:
                return None
            payload = json.loads(result.decode())
            basic = (payload.get("data") or {}).get("basicData")
            if basic:
                district = basic.get("district", "") or ""
                if "省" in district:
                    province = f'{district.split("省")[0]}省'
                elif "市" in district:
                    province = f'{district.split("市")[0]}市'
                else:
                    # The original appended "市" even to an empty or city-less district.
                    province = district
                previous = basic.get("prevEntName", "")
                if isinstance(previous, list):
                    previous = previous[0] if previous else ""
                # A missing insurance block must not discard the whole record.
                insurance = basic.get("insuranceInfo") or {}
                return {
                    "name_cn": basic.get("entName", ""),
                    "name_en": "",
                    "legal_person": basic.get("legalPerson", ""),
                    "registered_capital": basic.get("regCapital", ""),
                    "really_capital": basic.get("realCapital", ""),
                    "found_date": basic.get("startDate", ""),
                    "issue_date": basic.get("annualDate", ""),
                    "social_credit_code": basic.get("unifiedCode", ""),
                    "organization_code": basic.get("orgNo", ""),
                    "regist_code": basic.get("licenseNumber", ""),
                    "taxpayer_code": basic.get("regNo", ""),
                    "imp_exp_enterprise_code": "",
                    "industry_involved": basic.get("industry", ""),
                    "type": basic.get("entType", ""),
                    "license_start_date": basic.get("startDate", ""),
                    "license_end_date": basic.get("openTime", "").split("至")[-1].strip(),
                    "regist_office": basic.get("authority", ""),
                    "staff_size": "",
                    "insured_size": insurance.get("insuranceNum", ""),
                    "province": province,
                    "address": basic.get("addr", ""),
                    "business_scope": basic.get("scope", ""),
                    "email": basic.get("email", ""),
                    "unit_phone": basic.get("telephone", ""),
                    "fax": "",
                    "website": basic.get("website", ""),
                    "regist_address": basic.get("regAddr", ""),
                    "transformer_name": previous,
                    "status": basic.get("openStatus", ""),
                }
            # No basic data in the answer: fall through and try again.
        except Exception as e:
            logger.info(f"aqc_detail {e}")
        attempt += 1
        if attempt >= 2:
            return None
681 |
682 |
683 | # 国家企业信用信息公示系统
async def gsxt(**kwargs):
    """Search app.gsxt.gov.cn for the keyword and fetch every hit's details.

    Keyword Args:
        key: search keyword.
        proxy, proxy_user, proxy_pass: proxy settings, also used for the detail requests.
        retry: number of attempts already made by the caller.

    Returns:
        A list of company records (empty results dropped), or None when the search
        request failed, the answer held no data, or parsing raised twice.
    """
    proxy_opts = {name: kwargs.get(name, "") for name in ("proxy", "proxy_user", "proxy_pass")}
    attempt = kwargs.get("retry", 0)
    while True:
        try:
            meta = {
                "method": "POST",
                "url": "https://app.gsxt.gov.cn/gsxt/corp-query-app-search-1.html",
                "data": {
                    "conditions": '{"excep_tab":"0","ill_tab":"0","area":"0","cStatus":"0","xzxk":"0","xzcf":"0","dydj":"0"}',
                    "searchword": kwargs.get("key", ""), "sourceType": "W"},
                "headers": {"X-Requested-With": "XMLHttpRequest"},
                **proxy_opts,
            }
            result = await pub_req(**meta)
            if not result:
                return None
            payload = json.loads(result)
            if not payload.get("data", ""):
                return None
            # Pass the whole search row on: gsxt_detail reads entName, regNo, uniscId,
            # historyName, ... from it; with only "pripid" those were always empty.
            rows = [dict(r) for r in payload["data"]["result"]["data"] if r.get("pripid")]
            details = await asyncio.gather(*(gsxt_detail(data=row, **proxy_opts) for row in rows))
            return [x for x in details if x]
        except Exception as e:
            logger.info(f'gsxt {e}')
        attempt += 1
        if attempt >= 2:
            return None
719 |
720 |
721 | # 国家企业信用信息公示系统公司详情信息
async def gsxt_detail(**kwargs):
    """Fetch the base information of one company from app.gsxt.gov.cn.

    Keyword Args:
        data: search row holding at least "pripid"; its summary fields are used as a
            fallback when the detail request yields nothing.
        proxy, proxy_user, proxy_pass: proxy settings forwarded to pub_req.
        retry: number of attempts already made by the caller.

    Returns:
        The detailed record when the answer has a "result" section, the summary
        built from *data* otherwise, or None when *data* is empty or two attempts
        raised.
    """
    import re  # only needed for the markup stripping below

    data = kwargs.get("data", "")
    if not data:
        return None

    def plain(value):
        # NOTE(review): the original chained .replace("", "") calls, which are no-ops;
        # presumably they were meant to drop highlight markup - confirm against a live response.
        return re.sub(r"<[^>]+>", "", value or "")

    attempt = kwargs.get("retry", 0)
    while True:
        try:
            meta = {
                "url": f'https://app.gsxt.gov.cn/gsxt/corp-query-entprise-info-primaryinfoapp-entbaseInfo-{data["pripid"]}.html',
                "params": {"nodeNum": "310000", "entType": "6150", "sourceType": "W"},
                "headers": {"Referer": "https://servicewechat.com", "content-type": "application/x-www-form-urlencoded",
                            "Accept-Encoding": "gzip, deflate, br"},
                "proxy": kwargs.get("proxy", ""),
                "proxy_user": kwargs.get("proxy_user", ""),
                "proxy_pass": kwargs.get("proxy_pass", ""),
            }
            res = await pub_req(**meta)
            summary = {
                "name_cn": plain(data.get("entName", "")),
                "status": data.get("corpStatusString", ""),
                "regist_code": data.get("regNo", ""),
                "social_credit_code": data.get("uniscId", ""),
                "legal_person": data.get("legelRep", ""),
                "type": data.get("entTypeString", ""),
                "found_date": data.get("estDate", ""),
                "regist_office": data.get("regOrg", ""),
                "transformer_name": plain(data.get("historyName", "")),
            }
            if not res:
                return summary
            res = json.loads(res.decode())
            detail = res.get("result")
            if not detail:
                return summary
            # .get() keeps a record with a missing field instead of discarding it on KeyError.
            # The capital fields were read from the top level of the answer; fall back
            # to that when they are not inside "result".
            capital = detail.get("regCaption", res.get("regCaption", ""))
            currency = detail.get("regCapCurCN", res.get("regCapCurCN", ""))
            return {
                "name_cn": detail.get("entName", ""),
                "name_en": "",
                "legal_person": detail.get("name", ""),
                "registered_capital": f'{capital}{currency}'.strip(),
                "really_capital": "",
                "found_date": detail.get("estDate", ""),
                "issue_date": detail.get("apprDate", ""),
                "social_credit_code": detail.get("uniscId", ""),
                "organization_code": "",
                "regist_code": detail.get("regNo", ""),
                "taxpayer_code": "",
                "imp_exp_enterprise_code": "",
                "industry_involved": detail.get("industryPhy", ""),
                "type": detail.get("entType_CN", ""),
                "license_start_date": detail.get("opFrom", ""),
                "license_end_date": detail.get("opTo", ""),
                "regist_office": detail.get("regOrg_CN", ""),
                "staff_size": "",
                "insured_size": "",
                "province": res.get("nodeNum", ""),
                "address": detail.get("dom", ""),
                "business_scope": detail.get("opScope", ""),
                "email": "",
                "unit_phone": "",
                "fax": "",
                "website": "",
                "regist_address": detail.get("dom", ""),
                "transformer_name": data.get("historyName", ""),
                "status": detail.get("regState_CN", ""),
            }
        except Exception as e:
            logger.info(f'gsxt_detail {e}')
        attempt += 1
        if attempt >= 2:
            return None
791 |
792 |
async def test():
    """Manual check: run one qcc() search through a local proxy and log the outcome."""
    local_proxy = 'http://127.0.0.1:1080'
    logger.info(local_proxy)
    outcome = await qcc(key="特变电工湖南工程有限公司", proxy=local_proxy)
    logger.info(outcome)
801 |
802 |
if __name__ == '__main__':
    # Manual smoke test: fetch one Tianyancha company page and log the parsed record.
    # Any other coroutine of this module (qcc, aqc, gsxt, get_proxy, query_ip, ...) can be
    # passed to asyncio.run() in the same way, e.g. qcc(**{"key": "哔哩哔哩", "proxy": ""}).
    # asyncio.run() creates and closes its own event loop; fetching the loop with
    # asyncio.get_event_loop() outside a running loop is deprecated.
    rs = asyncio.run(tyc_detail(**{"id": "3149889182"}))
    logger.info(rs)
837 |
838 | # Tunnel connection failed: 401 Authorized failed
839 |
--------------------------------------------------------------------------------
/看准-企业工商信息查询.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Author: Litre WU
3 | # E-mail: litre-wu@tutanota.com
4 | # Software: PyCharm
5 | # File: 看准-企业工商信息查询.py
6 | # Time: 1月 06, 2022
7 | import asyncio
8 | from aiohttp import ClientSession, ClientTimeout, TCPConnector
9 | from user_agent import generate_user_agent
10 | from random import randint
11 | from time import sleep
12 | from sys import platform
13 | from json import loads
14 | from bs4 import BeautifulSoup
15 | import pandas as pd
16 | from fastapi import FastAPI
17 | from pydantic import BaseModel, Field
18 | import socket
19 | from boltons.cacheutils import LRU
20 | from hashlib import md5
21 |
if platform == "win32":
    # Select the selector-based event loop on Windows by installing the policy only.
    # A loop installed on the previous policy right before this call would be
    # orphaned (never used, never closed) as soon as the policy is replaced, so no
    # loop is created here; the new policy builds one on demand.
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
25 |
# Address interpolated into the external-docs link below. Resolving the local
# hostname can fail (no DNS/hosts entry for it); that must not prevent the module
# from importing, so fall back to the loopback address.
try:
    host = socket.gethostbyname(socket.gethostname())
except OSError:
    host = "127.0.0.1"

# In-memory response cache shared by the endpoints (at most 100 entries).
lru_cache = LRU(max_size=100)

# OpenAPI tag metadata shown in the generated documentation.
tags_metadata = [
    {
        "name": "看准-企业工商信息查询接口",
        "description": "看准-企业工商信息查询",
        "externalDocs": {
            "description": "More",
            "url": f"http://{host}/docs",
        },
    },
]

# Maintainer contact published in the OpenAPI document.
contact = {
    "name": "Litre",
    "url": "http://121.37.209.113",
    "email": "litre-wu@tutanota.com",
}

app = FastAPI(
    openapi_url="/api/v1/api.json",
    title="看准-企业工商信息查询接口",
    contact=contact,
    openapi_tags=tags_metadata,
)
48 |
49 |
class SearchItem(BaseModel):
    """Request body of the /search endpoint.

    All fields are required strings; the examples are therefore given as strings
    too, so the example body shown in the API docs validates against this model.
    """

    # Search keyword.
    query: str = Field(..., example='哔哩哔哩')
    # City filter code.
    cityCode: str = Field(..., example='0')
    # Industry filter codes.
    industryCodes: str = Field(..., example='')
    # Page number of the result list.
    pageNum: str = Field(..., example='1')
    # Page size.
    limit: str = Field(..., example='15')
56 |
57 |
# Search endpoint
@app.post("/search", tags=["看准-工商信息查询"])
async def search(data: SearchItem):
    # The cache key is the MD5 of the request payload's string form, so identical
    # requests are answered from the in-memory LRU cache.
    payload = data.dict()
    cache_key = md5(str(payload).encode()).hexdigest()
    cached = lru_cache.get(cache_key)
    if cached:
        return cached
    found = await query(**payload)
    # Only truthy results are cached, so failed lookups are retried next time.
    if found:
        lru_cache[cache_key] = found
    return found
67 |
68 |
class InfoItem(BaseModel):
    # Request body of the /compInfo endpoint.
    # Required company identifier; it is interpolated into the company page URL.
    encCompanyId: str = Field(default=..., example='0XN_2dW7Fw~~')
71 |
72 |
# Company registration info endpoint
@app.post("/compInfo", tags=["看准-工商信息查询"])
async def info(data: InfoItem):
    # The cache key is the MD5 of the request payload's string form, so identical
    # requests are answered from the in-memory LRU cache.
    payload = data.dict()
    cache_key = md5(str(payload).encode()).hexdigest()
    cached = lru_cache.get(cache_key)
    if cached:
        return cached
    details = await compInfo(**payload)
    # Only truthy results are cached, so failed lookups are retried next time.
    if details:
        lru_cache[cache_key] = details
    return details
82 |
83 |
# Shared HTTP request helper
_PUB_REQ_CONCURRENCY = 20
_pub_req_semaphore = None


def _pub_req_limiter():
    """Return the process-wide semaphore that caps concurrent requests.

    It is created lazily on first use (inside a running coroutine) and then
    shared by every call; a semaphore built per call would never limit anything.
    """
    global _pub_req_semaphore
    if _pub_req_semaphore is None:
        _pub_req_semaphore = asyncio.Semaphore(_PUB_REQ_CONCURRENCY)
    return _pub_req_semaphore


async def pub_req(**kwargs):
    """Send one HTTP request and return the raw response body.

    Keyword arguments:
        method:  HTTP method, default "GET".
        url:     target URL.
        params:  query-string parameters.
        data:    request body.
        headers: extra headers; merged over a generated User-Agent.
        proxy:   proxy URL handed to aiohttp unchanged.
        timeout: total timeout in seconds (or a ClientTimeout), default 10.
        retry:   number of attempts already used by the caller, default 0.

    Returns:
        The response body as bytes when the status is 200 or 201, otherwise
        None once the attempts are exhausted (two attempts in total when
        ``retry`` starts at 0, always at least one).
    """
    method = kwargs.get("method", "GET")
    url = kwargs.get("url", "")
    params = kwargs.get("params", {})
    data = kwargs.get("data", {})
    headers = {"User-Agent": generate_user_agent(), **kwargs.get("headers", {})}
    proxy = kwargs.get("proxy", "")
    timeout = kwargs.get("timeout", 10)
    if not isinstance(timeout, ClientTimeout):
        # A bare number as the per-request timeout is deprecated in aiohttp.
        timeout = ClientTimeout(total=timeout)
    attempts = max(1, 2 - kwargs.get("retry", 0))

    for attempt in range(attempts):
        try:
            # The limiter is released before any back-off/retry, so a retrying
            # call never holds a slot while waiting for another one.
            async with _pub_req_limiter():
                async with ClientSession(timeout=timeout,
                                         connector=TCPConnector(ssl=False),
                                         trust_env=True) as client:
                    async with client.request(method=method, url=url, params=params,
                                              data=data, headers=headers,
                                              proxy=proxy) as rs:
                        if rs.status in (200, 201):
                            return await rs.read()
        except Exception as e:
            print(e)
        if attempt + 1 < attempts:
            # Non-blocking back-off: time.sleep() here would stall the event loop.
            await asyncio.sleep(randint(1, 2))
    return None
121 |
122 |
# Search
async def query(**kwargs):
    """Call the kanzhun company search endpoint and return the decoded JSON.

    Recognised keyword arguments (with their fallbacks when omitted) are
    ``query``, ``cityCode``, ``industryCodes``, ``pageNum`` and ``limit``.
    Returns None when the request yields no body.
    """
    fallbacks = {
        "query": "哔哩哔哩",
        "cityCode": 0,
        "industryCodes": "",
        "pageNum": 1,
        "limit": 15,
    }
    search_params = {name: kwargs.get(name, fallback) for name, fallback in fallbacks.items()}
    body = await pub_req(
        url="https://www.kanzhun.com/search/company_v2.json",
        params=search_params,
        headers={"Accept-Encoding": "gzip, deflate, br"},
    )
    return loads(body) if body else None
142 |
143 |
# Company registration info
def _rows_to_info(rows):
    """Flatten table rows of alternating label/value cells into a dict."""
    info = {}
    for row in rows:
        # Cells come as (label, value) pairs; zip() drops a trailing unpaired cell
        # instead of indexing past the end of the row.
        for label, value in zip(row[::2], row[1::2]):
            # Empty cells are parsed as NaN (a float), and placeholder labels
            # consist only of dashes; neither is a usable key.
            if not isinstance(label, str) or not label.strip("-"):
                continue
            # NaN is not valid JSON, so report a missing value as None.
            info[label] = None if pd.isna(value) else value
    return info


async def compInfo(**kwargs):
    """Fetch a company's page and return its registration table as a dict.

    Keyword arguments:
        encCompanyId: company identifier interpolated into the page URL.

    Returns:
        A dict mapping each table label to its value, or None when the page
        could not be fetched or contains no company description table.
    """
    from io import StringIO  # read_html() expects file-like input for literal HTML

    meta = {
        "url": f'https://www.kanzhun.com/firm/info/{kwargs.get("encCompanyId", "")}.html',
        "headers": {
            "Accept-Encoding": "gzip, deflate, br"
        }
    }
    res = await pub_req(**meta)
    if not res:
        return None
    soup = BeautifulSoup(res.decode(), 'html.parser')
    desc = soup.find("div", class_="kz-company-desc")
    if desc is None or desc.table is None:
        return None
    try:
        tables = pd.read_html(StringIO(str(desc.table)))
    except ValueError:
        # Raised by pandas when the markup holds no parsable table.
        return None
    return _rows_to_info(tables[0].values.tolist())
165 |
166 |
async def main():
    """Manual check: run the default search and print the decoded response."""
    # To inspect a single company instead:
    # print(await compInfo(encCompanyId="0XN_2dW7Fw~~"))
    print(await query())
171 |
172 |
if __name__ == '__main__':
    # asyncio.run() creates a fresh event loop, runs main() to completion and closes
    # the loop afterwards; fetching the loop via get_event_loop() outside a running
    # loop is deprecated.
    asyncio.run(main())
175 |
--------------------------------------------------------------------------------