├── requirements.txt
├── README.md
├── auth.py
├── repeat.py
├── utils.py
├── connection
    └── __init__.py
├── .gitignore
├── main.py
└── database
    └── __init__.py


/requirements.txt:
--------------------------------------------------------------------------------
1 | httpx
2 | selenium
3 | tqdm
4 | sqlalchemy
5 | bs4
6 | pymysql


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ZhiHu_Spider
2 | 知乎爬虫
3 | 
4 | 用于爬取知乎页面 **话题** **问题** **回答** **评论** 的爬虫
5 | 
6 |  - **支持 asyncio 异步高并发**
7 |  - **支持多用户登录**
8 | 


--------------------------------------------------------------------------------
/auth.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | 
 3 | from utils import load_all_cookies, cookies_list
 4 | 
 5 | from selenium import webdriver
 6 | 
 7 | 
 8 | def main():
 9 |     driver = webdriver.Chrome()
10 |     load_all_cookies()
11 |     for cookies in cookies_list:
12 |         driver.get('https://www.zhihu.com/')
13 |         for key, value in cookies.items():
14 |             driver.add_cookie({'name': key, 'value': value})
15 | 
16 |         time.sleep(1)
17 |         driver.get('https://www.zhihu.com/')
18 |         pass
19 | 
20 | 
21 | if __name__ == '__main__':
22 |     main()
23 | 


--------------------------------------------------------------------------------
/repeat.py:
--------------------------------------------------------------------------------
 1 | from sqlalchemy import func, Table, MetaData
 2 | from database import Database, zhihu_question, zhihu_answer, zhihu_article, zhihu_topic, zhihu_user, zhihu_comment
 3 | 
 4 | 
 5 | def distinct(table):
 6 |     dump = (db.session.query(table.uid).having(func.count(table.uid) > 1)
 7 |             .group_by(table.uid).all())
 8 | 
 9 |     for row in dump:
10 |         rows = db.session.query(table).filter(
11 |             table.uid == row[0]).all()
12 |         for i in rows[1:]:
13 |             db.session.delete(i)
14 |     db.session.commit()
15 |     pass
16 | 
17 | if __name__ == '__main__':
18 |     db = Database(
19 |         'mysql+pymysql://root:20131114@localhost:3306/zhihu?charset=utf8mb4')
20 |     distinct(zhihu_answer)
21 |     distinct(zhihu_article)
22 |     distinct(zhihu_comment)
23 |     distinct(zhihu_question)
24 |     distinct(zhihu_topic)
25 |     distinct(zhihu_user)
26 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import pathlib
 3 | import random
 4 | 
 5 | import httpx
 6 | from tqdm import tqdm
 7 | from selenium import webdriver
 8 | 
 9 | cookies_list = []  # httpx.Cookies objects, filled by load_all_cookies()
10 | cookies_path = pathlib.Path('./cookies/')  # directory holding the saved cookie JSON files
11 | cookies_path.mkdir(exist_ok=True)  # created at import time if it is missing
12 | 
13 | 
14 | idx = 0  # position of the next cookie set handed out by get_random_cookies()
15 | 
16 | def remove_cookies(cookies: httpx.Cookies) -> None:
17 |     global cookies_list, idx
18 |     # cookies_list.remove(cookies)
19 |     tqdm.write(f'Remove cookies {cookies.file_name}')
20 |     idx %= len(cookies_list)
21 | 
22 | 
23 | def load_all_cookies() -> None:
24 |     global cookies_list
25 |     for cookies in pathlib.Path(cookies_path).glob('*.json'):
26 |         with open(cookies, 'r', encoding='u8') as f:
27 |             s = json.load(f)
28 |             httpx_cookies = httpx.Cookies()
29 |             for key in s:
30 |                 httpx_cookies.set(key['name'], key['value'], domain=key['domain'], path=key['path'])
31 |             httpx_cookies.file_name = cookies.stem
32 |             cookies_list.append(httpx_cookies)
33 | 
34 | 
35 | def save_cookies() -> None:
36 |     def auto_increment() -> int:
37 |         cookies = list(pathlib.Path(cookies_path).glob('*.json'))
38 |         if not cookies:
39 |             return 1
40 |         return max([int(i.stem) for i in cookies]) + 1
41 | 
42 |     driver = webdriver.Chrome()
43 |     driver.get('https://www.zhihu.com/signin?next=%2F')
44 |     while True:
45 |         if driver.current_url == 'https://www.zhihu.com/':
46 |             break
47 |     cookies = driver.get_cookies()
48 |     with open(cookies_path / f'{auto_increment()}.json', 'w') as f:
49 |         json.dump(cookies, f, ensure_ascii=False)
50 | 
51 | 
52 | def get_random_cookies() -> httpx.Cookies:
53 |     if len(cookies_list) == 0:
54 |         raise NotImplementedError('No cookies')
55 |     global idx
56 |     idx %= len(cookies_list)
57 |     cookies = cookies_list[idx]
58 |     idx += 1
59 |     return cookies
60 | 


--------------------------------------------------------------------------------
/connection/__init__.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import time
 3 | from urllib.parse import urlencode
 4 | 
 5 | import httpx
 6 | from tqdm import tqdm
 7 | 
 8 | from utils import get_random_cookies, remove_cookies
 9 | 
10 | limit = asyncio.Semaphore(5)
11 | 
12 | 
13 | class API:
14 |     base_url = 'https://api.zhihu.com/'
15 |     v4_url = 'https://www.zhihu.com/api/v4/'
16 |     v5_url = 'https://www.zhihu.com/api/v5/'
17 | 
18 |     async def get(self, msg: str, types: str = 'base') -> httpx.Response:
19 |         if types == 'base':
20 |             url = self.base_url
21 |         elif types == 'v4':
22 |             url = self.v4_url
23 |         elif types == 'v5':
24 |             url = self.v5_url
25 |         else:
26 |             raise ValueError('types must be "base", "v4" or "v5"')
27 |         try:
28 |             cookies = get_random_cookies()
29 |             async with limit:
30 |                 async with httpx.AsyncClient(follow_redirects=True, cookies=cookies, timeout=3) as client:
31 |                     resp = await client.get(url + msg)
32 |                     await asyncio.sleep(0.3)
33 |             if resp.status_code == 403:
34 |                 remove_cookies(cookies)
35 |                 raise ConnectionRefusedError
36 |             assert resp.status_code == 200
37 |         except ConnectionRefusedError:
38 |             # 输出状态码
39 |             tqdm.write(f'\tError: {resp.status_code} {url + msg}')
40 |             return await self.get(msg, types)
41 |         except httpx.ConnectTimeout:
42 |             return await self.get(msg, types)
43 |         except Exception as e:
44 |             # 输出错误类型
45 |             tqdm.write(f'\tError: {type(e)} {url + msg}')
46 |             return await self.get(msg, types)
47 |         return resp
48 | 
49 |     async def get_topic(self, ids: int, path: str, arg=None,
50 |                         types: str = 'base') -> httpx.Response:
51 |         if arg is None:
52 |             arg = {}
53 |         msg = f'topics/{ids}' + path + \
54 |               (('?' + urlencode(arg)) if urlencode(arg) else '')
55 |         return await self.get(msg, types=types)
56 | 
57 |     async def get_article(self, ids: int, path: str, arg=None,
58 |                           types: str = 'base') -> httpx.Response:
59 |         if arg is None:
60 |             arg = {}
61 |         msg = f'articles/{ids}' + path + \
62 |               (('?' + urlencode(arg)) if urlencode(arg) else '')
63 |         return await self.get(msg, types=types)
64 | 
65 |     async def get_answer(self, ids: int, path: str, arg=None,
66 |                          types: str = 'base') -> httpx.Response:
67 |         if arg is None:
68 |             arg = {}
69 |         msg = f'answers/{ids}' + path + \
70 |               (('?' + urlencode(arg)) if urlencode(arg) else '')
71 |         return await self.get(msg, types=types)
72 | 
73 |     async def get_question(self, ids: int, path: str, arg=None,
74 |                            types: str = 'base') -> httpx.Response:
75 |         if arg is None:
76 |             arg = {}
77 |         msg = f'questions/{ids}' + path + \
78 |               (('?' + urlencode(arg)) if urlencode(arg) else '')
79 |         return await self.get(msg, types=types)
80 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
159 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | .idea/
161 | 
162 | *.json


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | 
  3 | from sqlalchemy import distinct
  4 | from tqdm import tqdm
  5 | 
  6 | from connection import API
  7 | from database import *
  8 | from utils import load_all_cookies, save_cookies
  9 | 
 10 | api = API()  # shared API client used by the crawler coroutines in this module
 11 | 
 12 | 
 13 | async def get_topic(topic_id: int) -> None:
 14 |     info = (
 15 |         await api.get_topic(topic_id, '', arg={'include': 'created,updated'}, types='v5')).json()
 16 |     db.inserts(zhihu_topic.load(info))
 17 | 
 18 |     num = (await api.get_topic(topic_id, '/feeds/essence', arg={'limit': 1})).json().get('paging',
 19 |                                                                                          {}).get(
 20 |         'totals', 0)
 21 |     for i in range(0, num, 20):
 22 |         data = (await api.get_topic(topic_id, '/feeds/essence', {'limit': 20, 'offset': i},
 23 |                                     types='v5')).json()
 24 |         if not data.get('data', None):
 25 |             break
 26 |         data = data.get('data')
 27 |         answer_list = []
 28 |         article_list = []
 29 |         user_list = []
 30 |         question_list = []
 31 |         for j in data:
 32 |             target = j.get('target')
 33 |             user_list.append(target.get('author'))
 34 |             types = target.get('type')
 35 |             if types == 'answer':
 36 |                 question_list.append(target.get('question'))
 37 |                 answer_list.append(target)
 38 |             elif types == 'article':
 39 |                 article_list.append(target)
 40 | 
 41 |         if answer_list:
 42 |             db.inserts([zhihu_answer.load(i) for i in answer_list])
 43 |         if article_list:
 44 |             db.inserts([zhihu_article.load(i) for i in article_list])
 45 |         if user_list:
 46 |             db.inserts([zhihu_user.load(i) for i in user_list])
 47 |         if question_list:
 48 |             db.inserts([zhihu_question.load(i) for i in question_list])
 49 | 
 50 | 
 51 | async def get_question_answer(question_id: int) -> None:
 52 |     num = (await api.get_question(question_id, '/answers', arg={'limit': 1})).json().get('paging',
 53 |                                                                                          {}).get(
 54 |         'totals', 0)
 55 |     for j in range(0, num, 20):
 56 |         arg = {'limit': 20, 'offset': j,
 57 |                'include': 'content,voteup_count,favlists_count,comment_count,is_labeled'}
 58 |         data = (await api.get_question(question_id, '/answers', arg=arg)).json()
 59 |         if not data.get('data', None):
 60 |             break
 61 |         data = data.get('data')
 62 |         answer_list = []
 63 |         user_list = []
 64 |         for k in data:
 65 |             user_list.append(k.get('author'))
 66 |             answer_list.append(k)
 67 | 
 68 |         if answer_list:
 69 |             db.inserts([zhihu_answer.load(i) for i in answer_list])
 70 |         if user_list:
 71 |             db.inserts([zhihu_user.load(i) for i in user_list])
 72 | 
 73 | 
 74 | async def get_answer_comment(answer_id: int) -> None:
 75 |     num = (await api.get_answer(answer_id, '/comments', arg={'limit': 1})).json().get('paging',
 76 |                                                                                       {}).get(
 77 |         'totals', 0)
 78 |     for j in range(0, num, 20):
 79 |         arg = {'limit': 20, 'offset': j}
 80 |         data = (await api.get_answer(answer_id, '/comments', arg=arg)).json()
 81 |         if not data.get('data', None):
 82 |             break
 83 |         data = data.get('data')
 84 |         comment_list = []
 85 |         user_list = []
 86 |         for k in data:
 87 |             user_list.append(k.get('author', {}).get('member', {}))
 88 |             comment_list.append(k)
 89 | 
 90 |         if comment_list:
 91 |             db.inserts([zhihu_comment.load(i) for i in comment_list])
 92 |         if user_list:
 93 |             db.inserts([zhihu_user.load(i) for i in user_list])
 94 | 
 95 | 
 96 | async def get_question_comment(question_id: int) -> None:
 97 |     num = (await api.get_question(question_id, '/comments', arg={'limit': 1})).json().get(
 98 |         'paging', {}).get('totals', 0)
 99 |     for j in range(0, num, 20):
100 |         arg = {'limit': 20, 'offset': j}
101 |         data = (await api.get_question(question_id, '/comments', arg=arg)).json()
102 |         if not data.get('data', None):
103 |             break
104 |         data = data.get('data')
105 |         comment_list = []
106 |         user_list = []
107 |         for k in data:
108 |             user_list.append(k.get('author', {}).get('member', {}))
109 |             comment_list.append(k)
110 | 
111 |         if comment_list:
112 |             db.inserts([zhihu_comment.load(i) for i in comment_list])
113 |         if user_list:
114 |             db.inserts([zhihu_user.load(i) for i in user_list])
115 | 
116 | 
117 | async def get_article_comment(article_id: int) -> None:
118 |     num = (await api.get_article(article_id, '/comments', arg={'limit': 1})).json().get('paging',
119 |                                                                                         {}).get(
120 |         'totals', 0)
121 |     for j in range(0, num, 20):
122 |         arg = {'limit': 20, 'offset': j}
123 |         data = (await api.get_article(article_id, '/comments', arg=arg)).json()
124 |         if not data.get('data', None):
125 |             break
126 |         data = data.get('data')
127 |         comment_list = []
128 |         user_list = []
129 |         for k in data:
130 |             user_list.append(k.get('author', {}).get('member', {}))
131 |             comment_list.append(k)
132 | 
133 |         if comment_list:
134 |             db.inserts([zhihu_comment.load(i) for i in comment_list])
135 |         if user_list:
136 |             db.inserts([zhihu_user.load(i) for i in user_list])
137 | 
138 | 
139 | async def get_all_topic() -> None:
140 |     topic_list = [23507285, 26640843, 27795532, 20205523, 25671250, 23560902, 21763228]
141 |     # topic_list = [23507285]
142 |     tasks = [asyncio.create_task(get_topic(i)) for i in topic_list]
143 |     for task in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc='Topic'):
144 |         await task
145 | 
146 | 
147 | async def get_all_question_answer() -> None:
148 |     question_list = [i[0] for i in db.session.query(
149 |         distinct(zhihu_question.uid)).all()]
150 |     tasks = [asyncio.create_task(get_question_answer(i))
151 |              for i in question_list]
152 |     for task in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc='Question Answer'):
153 |         # await task
154 |         asyncio.run(task)
155 | 
156 | async def get_all_answer_comment() -> None:
157 |     answer_list = [i[0]
158 |                    for i in db.session.query(distinct(zhihu_answer.uid)).all()]
159 |     tasks = [asyncio.create_task(get_answer_comment(i)) for i in answer_list]
160 |     for task in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc='Answer Comment'):
161 |         # await task
162 |         asyncio.run(task)
163 | 
164 | 
165 | async def get_all_question_comment() -> None:
166 |     question_list = [i[0] for i in db.session.query(
167 |         distinct(zhihu_question.uid)).all()]
168 |     tasks = [asyncio.create_task(get_question_comment(i))
169 |              for i in question_list]
170 |     for task in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc='Question Comment'):
171 |         # await task
172 |         asyncio.run(task)
173 | 
174 | 
175 | async def get_all_article_comment() -> None:
176 |     article_list = [i[0]
177 |                     for i in db.session.query(distinct(zhihu_article.uid)).all()]
178 |     tasks = [asyncio.create_task(get_article_comment(i)) for i in article_list]
179 |     for task in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc='Article Comment'):
180 |         # await task
181 |         asyncio.run(task)
182 | 
183 | 
184 | def main():
185 |     asyncio.run(get_all_topic())
186 |     asyncio.run(get_all_question_answer())
187 |     asyncio.run(get_all_article_comment())
188 |     asyncio.run(get_all_question_comment())
189 |     asyncio.run(get_all_answer_comment())
190 | 
191 | 
192 | if __name__ == '__main__':
193 |     # save_cookies()
194 |     # exit(0)
195 |     load_all_cookies()
196 |     db = Database(
197 |         'mysql+pymysql://root:20131114@localhost:3306/zhihu?charset=utf8mb4')
198 |     # drop all table
199 |     # Base.metadata.drop_all(db.engine)
200 |     db.create_all_table()
201 | 
202 |     main()
203 |     pass
204 | 


--------------------------------------------------------------------------------
/database/__init__.py:
--------------------------------------------------------------------------------
  1 | import datetime
  2 | import html
  3 | from typing import Union, Iterable
  4 | 
  5 | from bs4 import BeautifulSoup
  6 | from sqlalchemy import create_engine, String, Column, Float, Integer, DateTime, Boolean, BigInteger
  7 | from sqlalchemy.dialects.mysql import LONGTEXT
  8 | from sqlalchemy.orm import DeclarativeBase, sessionmaker
  9 | 
 10 | 
 11 | class Base(DeclarativeBase):
 12 |     pass
 13 | 
 14 | 
 15 | class Database:
 16 |     def __init__(self, db_url: str) -> None:
 17 |         self.engine = create_engine(db_url, echo=True)
 18 |         self.session = sessionmaker(bind=self.engine)()
 19 | 
 20 |     def init(self) -> None:
 21 |         self.create_all_table()
 22 | 
 23 |     def create_all_table(self) -> None:
 24 |         Base.metadata.create_all(self.engine)
 25 | 
 26 |     def inserts(self, obj: Union[Iterable[Base], Base]) -> None:
 27 |         if isinstance(obj, list):
 28 |             self.session.add_all(obj)
 29 |         else:
 30 |             self.session.add(obj)
 31 |         try:
 32 |             self.session.commit()
 33 |         except Exception as e:
 34 |             self.session.rollback()
 35 |             raise e
 36 | 
 37 |     def close(self) -> None:
 38 |         self.session.close()
 39 |         self.engine.dispose()
 40 | 
 41 | 
 42 | 
 43 | class zhihu_topic(Base):
 44 |     __tablename__ = 'zhihu_topic'
 45 |     id = Column(Integer, primary_key=True, autoincrement=True)
 46 |     uid = Column(String(255), comment='话题id')
 47 |     name = Column(String(255), comment='话题名')
 48 |     url = Column(String(255), comment='话题链接')
 49 |     created = Column(DateTime, comment='创建时间')
 50 |     updated = Column(DateTime, comment='更新时间')
 51 |     introduction = Column(LONGTEXT, comment='话题简介')
 52 |     followers = Column(BigInteger, comment='关注人数')
 53 |     questions = Column(BigInteger, comment='问题数')
 54 |     avatar_url = Column(String(255), comment='话题头像链接')
 55 |     best_answers = Column(Integer, comment='最佳回答数')
 56 | 
 57 |     def __repr__(self):
 58 |         return f'<zhihu_topic(uid={self.uid}, name={self.name}, url={self.url}, created={self.created})>'
 59 | 
 60 |     @classmethod
 61 |     def load(cls, data: dict):
 62 |         uid = data.get('id', -1)
 63 |         name = data.get('name', '')
 64 |         url = data.get('url', '')
 65 |         created = datetime.datetime.fromtimestamp(data.get('created', 0))
 66 |         updated = datetime.datetime.fromtimestamp(data.get('updated', 0))
 67 |         introduction = html.escape(data.get('introduction', ''))
 68 |         followers = data.get('followers_count', 0)
 69 |         questions = data.get('questions_count', 0)
 70 |         avatar_url = data.get('avatar_url', '')
 71 |         best_answers = data.get('best_answers_count', 0)
 72 |         return cls(uid=uid, name=name, url=url, created=created, updated=updated, introduction=introduction,
 73 |                    followers=followers, questions=questions, avatar_url=avatar_url, best_answers=best_answers)
 74 | 
 75 | 
 76 | class zhihu_user(Base):
 77 |     __tablename__ = 'zhihu_user'
 78 |     id = Column(Integer, primary_key=True, autoincrement=True)
 79 |     uid = Column(String(255), comment='用户id')
 80 |     name = Column(String(255), comment='用户名')
 81 |     gender = Column(Integer, comment='性别')
 82 |     user_type = Column(String(255), comment='用户类型')
 83 |     url = Column(LONGTEXT, comment='用户链接')
 84 |     badge = Column(String(255), comment='用户标签')
 85 | 
 86 |     def __repr__(self):
 87 |         return f'<zhihu_user(uid={self.uid}, name={self.name})>'
 88 | 
 89 |     @classmethod
 90 |     def load(cls, data: dict):
 91 |         uid = data.get('id', -1)
 92 |         name = data.get('name', '')
 93 |         gender = data.get('gender', -1)
 94 |         user_type = data.get('user_type', '')
 95 |         url = data.get('url', '')
 96 |         badge = ', '.join([i.get('description', '') for i in data.get('badge')]) if data.get('badge', None) else ''
 97 |         return cls(uid=uid, name=name, gender=gender, user_type=user_type, url=url, badge=badge)
 98 | 
 99 | 
100 | class zhihu_answer(Base):
101 |     __tablename__ = 'zhihu_answer'
102 |     id = Column(Integer, primary_key=True, autoincrement=True)
103 |     uid = Column(String(255), comment='回答id')
104 |     question_id = Column(String(255), comment='问题id')
105 |     author_id = Column(String(255), comment='作者id')
106 |     created = Column(DateTime, comment='创建时间')
107 |     updated = Column(DateTime, comment='更新时间')
108 |     voteup_count = Column(BigInteger, comment='点赞数')
109 |     comment_count = Column(BigInteger, comment='评论数')
110 |     favlists_count = Column(BigInteger, comment='收藏数')
111 |     url = Column(String(255), comment='回答链接')
112 |     content = Column(LONGTEXT, comment='回答内容')
113 |     is_label = Column(Boolean, comment='是否标注')
114 | 
115 |     def __repr__(self):
116 |         return f'<zhihu_answer(uid={self.uid}, question_id={self.question_id}, author_id={self.author_id})>'
117 | 
118 |     @classmethod
119 |     def load(cls, data: dict):
120 |         uid = data.get('id', -1)
121 |         question_id = data.get('question', {}).get('id', -1)
122 |         author_id = data.get('author', {}).get('id', -1)
123 |         created = datetime.datetime.fromtimestamp(data.get('created_time', 0))
124 |         updated = datetime.datetime.fromtimestamp(data.get('updated_time', 0))
125 |         voteup_count = data.get('voteup_count', 0)
126 |         comment_count = data.get('comment_count', 0)
127 |         favlists_count = data.get('favlists_count', 0)
128 |         url = data.get('url', '')
129 |         content = BeautifulSoup(data.get('content', ''), 'html.parser').get_text().replace('"', "'").strip()
130 |         is_label = data.get('is_label', False)
131 |         return cls(uid=uid, question_id=question_id, author_id=author_id, created=created, updated=updated,
132 |                    voteup_count=voteup_count, comment_count=comment_count, favlists_count=favlists_count, url=url,
133 |                    content=content, is_label=is_label)
134 | 
135 | 
136 | class zhihu_article(Base):
137 |     __tablename__ = 'zhihu_article'
138 |     id = Column(Integer, primary_key=True, autoincrement=True)
139 |     uid = Column(String(255), comment='文章id')
140 |     author_id = Column(String(255), comment='作者id')
141 |     url = Column(String(255), comment='文章链接')
142 |     title = Column(String(255), comment='文章标题')
143 |     created = Column(DateTime, comment='创建时间')
144 |     updated = Column(DateTime, comment='更新时间')
145 |     voteup_count = Column(BigInteger, comment='点赞数')
146 |     comment_count = Column(BigInteger, comment='评论数')
147 |     favlists_count = Column(BigInteger, comment='收藏数')
148 |     content = Column(LONGTEXT, comment='文章内容')
149 |     is_label = Column(Boolean, comment='是否标注')
150 | 
151 |     def __repr__(self):
152 |         return f'<zhihu_article(uid={self.uid}, author_id={self.author_id}, title={self.title})>'
153 | 
154 |     @classmethod
155 |     def load(cls, data: dict):
156 |         uid = data.get('id', -1)
157 |         author_id = data.get('author', {}).get('id', -1)
158 |         url = data.get('url', '')
159 |         title = data.get('title', '')
160 |         created = datetime.datetime.fromtimestamp(data.get('created', 0))
161 |         updated = datetime.datetime.fromtimestamp(data.get('updated', 0))
162 |         voteup_count = data.get('voteup_count', 0)
163 |         comment_count = data.get('comment_count', 0)
164 |         favlists_count = data.get('favlists_count', 0)
165 |         content = BeautifulSoup(data.get('content', ''), 'html.parser').get_text().replace('"', "'").strip()
166 |         is_label = data.get('is_label', False)
167 | 
168 |         return cls(uid=uid, author_id=author_id, url=url, title=title, created=created, updated=updated,
169 |                    voteup_count=voteup_count, comment_count=comment_count, favlists_count=favlists_count,
170 |                    content=content, is_label=is_label)
171 | 
172 | 
173 | class zhihu_question(Base):
174 |     __tablename__ = 'zhihu_question'
175 |     id = Column(Integer, primary_key=True, autoincrement=True)
176 |     uid = Column(String(255), comment='问题id')
177 |     author_id = Column(String(255), comment='作者id')
178 |     title = Column(String(255), comment='问题标题')
179 |     created = Column(DateTime, comment='创建时间')
180 |     updated = Column(DateTime, comment='更新时间')
181 |     question_type = Column(String(255), comment='问题类型')
182 | 
183 |     def __repr__(self):
184 |         return f'<zhihu_question(uid={self.uid}, author_id={self.author_id}, title={self.title})>'
185 | 
186 |     @classmethod
187 |     def load(cls, data: dict):
188 |         uid = data.get('id', -1)
189 |         author_id = data.get('author', {}).get('id', -1)
190 |         title = data.get('title', '')
191 |         created = datetime.datetime.fromtimestamp(data.get('created_time', 0))
192 |         updated = datetime.datetime.fromtimestamp(data.get('updated_time', 0))
193 |         question_type = data.get('question_type', '')
194 |         return cls(uid=uid, author_id=author_id, title=title, created=created, updated=updated,
195 |                    question_type=question_type)
196 | 
197 | 
198 | class zhihu_comment(Base):
199 |     __tablename__ = 'zhihu_comment'
200 |     id = Column(Integer, primary_key=True, autoincrement=True)
201 |     uid = Column(String(255), comment='评论id')
202 |     author_id = Column(String(255), comment='作者id')
203 |     created = Column(DateTime, comment='创建时间')
204 |     content = Column(LONGTEXT, comment='评论内容')
205 |     vote_count = Column(BigInteger, comment='点赞数')
206 |     reply_to_author_id = Column(String(255), comment='回复对象id')
207 |     url = Column(String(255), comment='评论链接')
208 |     address = Column(String(255), comment='评论地址')
209 |     from_where = Column(String(255), comment='评论来源')
210 | 
211 |     def __repr__(self):
212 |         return f'<zhihu_comment(uid={self.uid}, author_id={self.author_id}, content={self.content})>'
213 | 
214 |     @classmethod
215 |     def load(cls, data: dict):
216 |         uid = data.get('id', -1)
217 |         author_id = data.get('author', {}).get('member', {}).get('id', -1)
218 |         created = datetime.datetime.fromtimestamp(data.get('created_time', 0))
219 |         content = BeautifulSoup(data.get('content', ''), 'html.parser').get_text().replace('"', "'").strip()
220 |         vote_count = data.get('vote_count', 0)
221 |         reply_to_author_id = data.get('reply_to_author', {}).get('member', {}).get('id', -1)
222 |         url = data.get('url', '')
223 |         address = data.get('address_text', '')
224 |         from_where = data.get('from_where', '')
225 |         return cls(uid=uid, author_id=author_id, created=created, content=content, vote_count=vote_count,
226 |                    reply_to_author_id=reply_to_author_id, url=url, address=address, from_where=from_where)
227 | 
228 | 
229 | if __name__ == '__main__':  # run this module directly to create the tables
230 |     db = Database('mysql+pymysql://root:20131114@localhost:3306/env?charset=utf8mb4')  # NOTE(review): targets database "env" while the other scripts use "zhihu" - confirm
231 |     db.create_all_table()
232 | 


--------------------------------------------------------------------------------