├── requirements.txt ├── README.md ├── auth.py ├── repeat.py ├── utils.py ├── connection └── __init__.py ├── .gitignore ├── main.py └── database └── __init__.py /requirements.txt: -------------------------------------------------------------------------------- 1 | httpx 2 | selenium 3 | tqdm 4 | sqlalchemy 5 | bs4 6 | pymysql -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ZhiHu_Spider 2 | 知乎爬虫 3 | 4 | 用于爬取知乎页面 **话题** **问题** **回答** **评论** 的爬虫 5 | 6 | - **支持 asyncio 异步高并发** 7 | - **支持多用户登陆** 8 | -------------------------------------------------------------------------------- /auth.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from utils import load_all_cookies, cookies_list 4 | 5 | from selenium import webdriver 6 | 7 | 8 | def main(): 9 | driver = webdriver.Chrome() 10 | load_all_cookies() 11 | for cookies in cookies_list: 12 | driver.get('https://www.zhihu.com/') 13 | for key, value in cookies.items(): 14 | driver.add_cookie({'name': key, 'value': value}) 15 | 16 | time.sleep(1) 17 | driver.get('https://www.zhihu.com/') 18 | pass 19 | 20 | 21 | if __name__ == '__main__': 22 | main() 23 | -------------------------------------------------------------------------------- /repeat.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import func, Table, MetaData 2 | from database import Database, zhihu_question, zhihu_answer, zhihu_article, zhihu_topic, zhihu_user, zhihu_comment 3 | 4 | 5 | def distinct(table): 6 | dump = (db.session.query(table.uid).having(func.count(table.uid) > 1) 7 | .group_by(table.uid).all()) 8 | 9 | for row in dump: 10 | rows = db.session.query(table).filter( 11 | table.uid == row[0]).all() 12 | for i in rows[1:]: 13 | db.session.delete(i) 14 | db.session.commit() 15 | pass 16 | 17 | if __name__ == '__main__': 18 | db = Database( 19 | 'mysql+pymysql://root:20131114@localhost:3306/zhihu?charset=utf8mb4') 20 | distinct(zhihu_answer) 21 | distinct(zhihu_article) 22 | distinct(zhihu_comment) 23 | distinct(zhihu_question) 24 | distinct(zhihu_topic) 25 | distinct(zhihu_user) 26 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pathlib 3 | import random 4 | 5 | import httpx 6 | from tqdm import tqdm 7 | from selenium import webdriver 8 | 9 | cookies_list = [] 10 | cookies_path = pathlib.Path('./cookies/') 11 | cookies_path.mkdir(exist_ok=True) 12 | 13 | 14 | idx = 0 15 | 16 | def remove_cookies(cookies: httpx.Cookies) -> None: 17 | global cookies_list, idx 18 | # cookies_list.remove(cookies) 19 | tqdm.write(f'Remove cookies {cookies.file_name}') 20 | idx %= len(cookies_list) 21 | 22 | 23 | def load_all_cookies() -> None: 24 | global cookies_list 25 | for cookies in pathlib.Path(cookies_path).glob('*.json'): 26 | with open(cookies, 'r', encoding='u8') as f: 27 | s = json.load(f) 28 | httpx_cookies = httpx.Cookies() 29 | for key in s: 30 | httpx_cookies.set(key['name'], key['value'], domain=key['domain'], path=key['path']) 31 | httpx_cookies.file_name = cookies.stem 32 | cookies_list.append(httpx_cookies) 33 | 34 | 35 | def save_cookies() -> None: 36 | def auto_increment() -> int: 37 | cookies = list(pathlib.Path(cookies_path).glob('*.json')) 38 | if not cookies: 39 | return 1 40 | return max([int(i.stem) for i in cookies]) + 1 41 | 42 | driver = webdriver.Chrome() 43 | driver.get('https://www.zhihu.com/signin?next=%2F') 44 | while True: 45 | if driver.current_url == 'https://www.zhihu.com/': 46 | break 47 | cookies = driver.get_cookies() 48 | with open(cookies_path / f'{auto_increment()}.json', 'w') as f: 49 | json.dump(cookies, f, ensure_ascii=False) 50 | 51 | 52 | def get_random_cookies() -> httpx.Cookies: 53 | if len(cookies_list) == 0: 54 | raise NotImplementedError('No cookies') 55 | global idx 56 | idx %= len(cookies_list) 57 | cookies = cookies_list[idx] 58 | idx += 1 59 | return cookies 60 | -------------------------------------------------------------------------------- /connection/__init__.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import time 3 | from urllib.parse import urlencode 4 | 5 | import httpx 6 | from tqdm import tqdm 7 | 8 | from utils import get_random_cookies, remove_cookies 9 | 10 | limit = asyncio.Semaphore(5) 11 | 12 | 13 | class API: 14 | base_url = 'https://api.zhihu.com/' 15 | v4_url = 'https://www.zhihu.com/api/v4/' 16 | v5_url = 'https://www.zhihu.com/api/v5/' 17 | 18 | async def get(self, msg: str, types: str = 'base') -> httpx.Response: 19 | if types == 'base': 20 | url = self.base_url 21 | elif types == 'v4': 22 | url = self.v4_url 23 | elif types == 'v5': 24 | url = self.v5_url 25 | else: 26 | raise ValueError('types must be "base", "v4" or "v5"') 27 | try: 28 | cookies = get_random_cookies() 29 | async with limit: 30 | async with httpx.AsyncClient(follow_redirects=True, cookies=cookies, timeout=3) as client: 31 | resp = await client.get(url + msg) 32 | await asyncio.sleep(0.3) 33 | if resp.status_code == 403: 34 | remove_cookies(cookies) 35 | raise ConnectionRefusedError 36 | assert resp.status_code == 200 37 | except ConnectionRefusedError: 38 | # 输出状态码 39 | tqdm.write(f'\tError: {resp.status_code} {url + msg}') 40 | return await self.get(msg, types) 41 | except httpx.ConnectTimeout: 42 | return await self.get(msg, types) 43 | except Exception as e: 44 | # 输出错误类型 45 | tqdm.write(f'\tError: {type(e)} {url + msg}') 46 | return await self.get(msg, types) 47 | return resp 48 | 49 | async def get_topic(self, ids: int, path: str, arg=None, 50 | types: str = 'base') -> httpx.Response: 51 | if arg is None: 52 | arg = {} 53 | msg = f'topics/{ids}' + path + \ 54 | (('?' + urlencode(arg)) if urlencode(arg) else '') 55 | return await self.get(msg, types=types) 56 | 57 | async def get_article(self, ids: int, path: str, arg=None, 58 | types: str = 'base') -> httpx.Response: 59 | if arg is None: 60 | arg = {} 61 | msg = f'articles/{ids}' + path + \ 62 | (('?' + urlencode(arg)) if urlencode(arg) else '') 63 | return await self.get(msg, types=types) 64 | 65 | async def get_answer(self, ids: int, path: str, arg=None, 66 | types: str = 'base') -> httpx.Response: 67 | if arg is None: 68 | arg = {} 69 | msg = f'answers/{ids}' + path + \ 70 | (('?' + urlencode(arg)) if urlencode(arg) else '') 71 | return await self.get(msg, types=types) 72 | 73 | async def get_question(self, ids: int, path: str, arg=None, 74 | types: str = 'base') -> httpx.Response: 75 | if arg is None: 76 | arg = {} 77 | msg = f'questions/{ids}' + path + \ 78 | (('?' + urlencode(arg)) if urlencode(arg) else '') 79 | return await self.get(msg, types=types) 80 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | .idea/ 161 | 162 | *.json -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from sqlalchemy import distinct 4 | from tqdm import tqdm 5 | 6 | from connection import API 7 | from database import * 8 | from utils import load_all_cookies, save_cookies 9 | 10 | api = API() 11 | 12 | 13 | async def get_topic(topic_id: int) -> None: 14 | info = ( 15 | await api.get_topic(topic_id, '', arg={'include': 'created,updated'}, types='v5')).json() 16 | db.inserts(zhihu_topic.load(info)) 17 | 18 | num = (await api.get_topic(topic_id, '/feeds/essence', arg={'limit': 1})).json().get('paging', 19 | {}).get( 20 | 'totals', 0) 21 | for i in range(0, num, 20): 22 | data = (await api.get_topic(topic_id, '/feeds/essence', {'limit': 20, 'offset': i}, 23 | types='v5')).json() 24 | if not data.get('data', None): 25 | break 26 | data = data.get('data') 27 | answer_list = [] 28 | article_list = [] 29 | user_list = [] 30 | question_list = [] 31 | for j in data: 32 | target = j.get('target') 33 | user_list.append(target.get('author')) 34 | types = target.get('type') 35 | if types == 'answer': 36 | question_list.append(target.get('question')) 37 | answer_list.append(target) 38 | elif types == 'article': 39 | article_list.append(target) 40 | 41 | if answer_list: 42 | db.inserts([zhihu_answer.load(i) for i in answer_list]) 43 | if article_list: 44 | db.inserts([zhihu_article.load(i) for i in article_list]) 45 | if user_list: 46 | db.inserts([zhihu_user.load(i) for i in user_list]) 47 | if question_list: 48 | db.inserts([zhihu_question.load(i) for i in question_list]) 49 | 50 | 51 | async def get_question_answer(question_id: int) -> None: 52 | num = (await api.get_question(question_id, '/answers', arg={'limit': 1})).json().get('paging', 53 | {}).get( 54 | 'totals', 0) 55 | for j in range(0, num, 20): 56 | arg = {'limit': 20, 'offset': j, 57 | 'include': 'content,voteup_count,favlists_count,comment_count,is_labeled'} 58 | data = (await api.get_question(question_id, '/answers', arg=arg)).json() 59 | if not data.get('data', None): 60 | break 61 | data = data.get('data') 62 | answer_list = [] 63 | user_list = [] 64 | for k in data: 65 | user_list.append(k.get('author')) 66 | answer_list.append(k) 67 | 68 | if answer_list: 69 | db.inserts([zhihu_answer.load(i) for i in answer_list]) 70 | if user_list: 71 | db.inserts([zhihu_user.load(i) for i in user_list]) 72 | 73 | 74 | async def get_answer_comment(answer_id: int) -> None: 75 | num = (await api.get_answer(answer_id, '/comments', arg={'limit': 1})).json().get('paging', 76 | {}).get( 77 | 'totals', 0) 78 | for j in range(0, num, 20): 79 | arg = {'limit': 20, 'offset': j} 80 | data = (await api.get_answer(answer_id, '/comments', arg=arg)).json() 81 | if not data.get('data', None): 82 | break 83 | data = data.get('data') 84 | comment_list = [] 85 | user_list = [] 86 | for k in data: 87 | user_list.append(k.get('author', {}).get('member', {})) 88 | comment_list.append(k) 89 | 90 | if comment_list: 91 | db.inserts([zhihu_comment.load(i) for i in comment_list]) 92 | if user_list: 93 | db.inserts([zhihu_user.load(i) for i in user_list]) 94 | 95 | 96 | async def get_question_comment(question_id: int) -> None: 97 | num = (await api.get_question(question_id, '/comments', arg={'limit': 1})).json().get( 98 | 'paging', {}).get('totals', 0) 99 | for j in range(0, num, 20): 100 | arg = {'limit': 20, 'offset': j} 101 | data = (await api.get_question(question_id, '/comments', arg=arg)).json() 102 | if not data.get('data', None): 103 | break 104 | data = data.get('data') 105 | comment_list = [] 106 | user_list = [] 107 | for k in data: 108 | user_list.append(k.get('author', {}).get('member', {})) 109 | comment_list.append(k) 110 | 111 | if comment_list: 112 | db.inserts([zhihu_comment.load(i) for i in comment_list]) 113 | if user_list: 114 | db.inserts([zhihu_user.load(i) for i in user_list]) 115 | 116 | 117 | async def get_article_comment(article_id: int) -> None: 118 | num = (await api.get_article(article_id, '/comments', arg={'limit': 1})).json().get('paging', 119 | {}).get( 120 | 'totals', 0) 121 | for j in range(0, num, 20): 122 | arg = {'limit': 20, 'offset': j} 123 | data = (await api.get_article(article_id, '/comments', arg=arg)).json() 124 | if not data.get('data', None): 125 | break 126 | data = data.get('data') 127 | comment_list = [] 128 | user_list = [] 129 | for k in data: 130 | user_list.append(k.get('author', {}).get('member', {})) 131 | comment_list.append(k) 132 | 133 | if comment_list: 134 | db.inserts([zhihu_comment.load(i) for i in comment_list]) 135 | if user_list: 136 | db.inserts([zhihu_user.load(i) for i in user_list]) 137 | 138 | 139 | async def get_all_topic() -> None: 140 | topic_list = [23507285, 26640843, 27795532, 20205523, 25671250, 23560902, 21763228] 141 | # topic_list = [23507285] 142 | tasks = [asyncio.create_task(get_topic(i)) for i in topic_list] 143 | for task in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc='Topic'): 144 | await task 145 | 146 | 147 | async def get_all_question_answer() -> None: 148 | question_list = [i[0] for i in db.session.query( 149 | distinct(zhihu_question.uid)).all()] 150 | tasks = [asyncio.create_task(get_question_answer(i)) 151 | for i in question_list] 152 | for task in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc='Question Answer'): 153 | # await task 154 | asyncio.run(task) 155 | 156 | async def get_all_answer_comment() -> None: 157 | answer_list = [i[0] 158 | for i in db.session.query(distinct(zhihu_answer.uid)).all()] 159 | tasks = [asyncio.create_task(get_answer_comment(i)) for i in answer_list] 160 | for task in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc='Answer Comment'): 161 | # await task 162 | asyncio.run(task) 163 | 164 | 165 | async def get_all_question_comment() -> None: 166 | question_list = [i[0] for i in db.session.query( 167 | distinct(zhihu_question.uid)).all()] 168 | tasks = [asyncio.create_task(get_question_comment(i)) 169 | for i in question_list] 170 | for task in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc='Question Comment'): 171 | # await task 172 | asyncio.run(task) 173 | 174 | 175 | async def get_all_article_comment() -> None: 176 | article_list = [i[0] 177 | for i in db.session.query(distinct(zhihu_article.uid)).all()] 178 | tasks = [asyncio.create_task(get_article_comment(i)) for i in article_list] 179 | for task in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc='Article Comment'): 180 | # await task 181 | asyncio.run(task) 182 | 183 | 184 | def main(): 185 | asyncio.run(get_all_topic()) 186 | asyncio.run(get_all_question_answer()) 187 | asyncio.run(get_all_article_comment()) 188 | asyncio.run(get_all_question_comment()) 189 | asyncio.run(get_all_answer_comment()) 190 | 191 | 192 | if __name__ == '__main__': 193 | # save_cookies() 194 | # exit(0) 195 | load_all_cookies() 196 | db = Database( 197 | 'mysql+pymysql://root:20131114@localhost:3306/zhihu?charset=utf8mb4') 198 | # drop all table 199 | # Base.metadata.drop_all(db.engine) 200 | db.create_all_table() 201 | 202 | main() 203 | pass 204 | -------------------------------------------------------------------------------- /database/__init__.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import html 3 | from typing import Union, Iterable 4 | 5 | from bs4 import BeautifulSoup 6 | from sqlalchemy import create_engine, String, Column, Float, Integer, DateTime, Boolean, BigInteger 7 | from sqlalchemy.dialects.mysql import LONGTEXT 8 | from sqlalchemy.orm import DeclarativeBase, sessionmaker 9 | 10 | 11 | class Base(DeclarativeBase): 12 | pass 13 | 14 | 15 | class Database: 16 | def __init__(self, db_url: str) -> None: 17 | self.engine = create_engine(db_url, echo=True) 18 | self.session = sessionmaker(bind=self.engine)() 19 | 20 | def init(self) -> None: 21 | self.create_all_table() 22 | 23 | def create_all_table(self) -> None: 24 | Base.metadata.create_all(self.engine) 25 | 26 | def inserts(self, obj: Union[Iterable[Base], Base]) -> None: 27 | if isinstance(obj, list): 28 | self.session.add_all(obj) 29 | else: 30 | self.session.add(obj) 31 | try: 32 | self.session.commit() 33 | except Exception as e: 34 | self.session.rollback() 35 | raise e 36 | 37 | def close(self) -> None: 38 | self.session.close() 39 | self.engine.dispose() 40 | 41 | 42 | 43 | class zhihu_topic(Base): 44 | __tablename__ = 'zhihu_topic' 45 | id = Column(Integer, primary_key=True, autoincrement=True) 46 | uid = Column(String(255), comment='话题id') 47 | name = Column(String(255), comment='话题名') 48 | url = Column(String(255), comment='话题链接') 49 | created = Column(DateTime, comment='创建时间') 50 | updated = Column(DateTime, comment='更新时间') 51 | introduction = Column(LONGTEXT, comment='话题简介') 52 | followers = Column(BigInteger, comment='关注人数') 53 | questions = Column(BigInteger, comment='问题数') 54 | avatar_url = Column(String(255), comment='话题头像链接') 55 | best_answers = Column(Integer, comment='最佳回答数') 56 | 57 | def __repr__(self): 58 | return f'' 59 | 60 | @classmethod 61 | def load(cls, data: dict): 62 | uid = data.get('id', -1) 63 | name = data.get('name', '') 64 | url = data.get('url', '') 65 | created = datetime.datetime.fromtimestamp(data.get('created', 0)) 66 | updated = datetime.datetime.fromtimestamp(data.get('updated', 0)) 67 | introduction = html.escape(data.get('introduction', '')) 68 | followers = data.get('followers_count', 0) 69 | questions = data.get('questions_count', 0) 70 | avatar_url = data.get('avatar_url', '') 71 | best_answers = data.get('best_answers_count', 0) 72 | return cls(uid=uid, name=name, url=url, created=created, updated=updated, introduction=introduction, 73 | followers=followers, questions=questions, avatar_url=avatar_url, best_answers=best_answers) 74 | 75 | 76 | class zhihu_user(Base): 77 | __tablename__ = 'zhihu_user' 78 | id = Column(Integer, primary_key=True, autoincrement=True) 79 | uid = Column(String(255), comment='用户id') 80 | name = Column(String(255), comment='用户名') 81 | gender = Column(Integer, comment='性别') 82 | user_type = Column(String(255), comment='用户类型') 83 | url = Column(LONGTEXT, comment='用户链接') 84 | badge = Column(String(255), comment='用户标签') 85 | 86 | def __repr__(self): 87 | return f'' 88 | 89 | @classmethod 90 | def load(cls, data: dict): 91 | uid = data.get('id', -1) 92 | name = data.get('name', '') 93 | gender = data.get('gender', -1) 94 | user_type = data.get('user_type', '') 95 | url = data.get('url', '') 96 | badge = ', '.join([i.get('description', '') for i in data.get('badge')]) if data.get('badge', None) else '' 97 | return cls(uid=uid, name=name, gender=gender, user_type=user_type, url=url, badge=badge) 98 | 99 | 100 | class zhihu_answer(Base): 101 | __tablename__ = 'zhihu_answer' 102 | id = Column(Integer, primary_key=True, autoincrement=True) 103 | uid = Column(String(255), comment='回答id') 104 | question_id = Column(String(255), comment='问题id') 105 | author_id = Column(String(255), comment='作者id') 106 | created = Column(DateTime, comment='创建时间') 107 | updated = Column(DateTime, comment='更新时间') 108 | voteup_count = Column(BigInteger, comment='点赞数') 109 | comment_count = Column(BigInteger, comment='评论数') 110 | favlists_count = Column(BigInteger, comment='收藏数') 111 | url = Column(String(255), comment='回答链接') 112 | content = Column(LONGTEXT, comment='回答内容') 113 | is_label = Column(Boolean, comment='是否标注') 114 | 115 | def __repr__(self): 116 | return f'' 117 | 118 | @classmethod 119 | def load(cls, data: dict): 120 | uid = data.get('id', -1) 121 | question_id = data.get('question', {}).get('id', -1) 122 | author_id = data.get('author', {}).get('id', -1) 123 | created = datetime.datetime.fromtimestamp(data.get('created_time', 0)) 124 | updated = datetime.datetime.fromtimestamp(data.get('updated_time', 0)) 125 | voteup_count = data.get('voteup_count', 0) 126 | comment_count = data.get('comment_count', 0) 127 | favlists_count = data.get('favlists_count', 0) 128 | url = data.get('url', '') 129 | content = BeautifulSoup(data.get('content', ''), 'html.parser').get_text().replace('"', "'").strip() 130 | is_label = data.get('is_label', False) 131 | return cls(uid=uid, question_id=question_id, author_id=author_id, created=created, updated=updated, 132 | voteup_count=voteup_count, comment_count=comment_count, favlists_count=favlists_count, url=url, 133 | content=content, is_label=is_label) 134 | 135 | 136 | class zhihu_article(Base): 137 | __tablename__ = 'zhihu_article' 138 | id = Column(Integer, primary_key=True, autoincrement=True) 139 | uid = Column(String(255), comment='文章id') 140 | author_id = Column(String(255), comment='作者id') 141 | url = Column(String(255), comment='文章链接') 142 | title = Column(String(255), comment='文章标题') 143 | created = Column(DateTime, comment='创建时间') 144 | updated = Column(DateTime, comment='更新时间') 145 | voteup_count = Column(BigInteger, comment='点赞数') 146 | comment_count = Column(BigInteger, comment='评论数') 147 | favlists_count = Column(BigInteger, comment='收藏数') 148 | content = Column(LONGTEXT, comment='文章内容') 149 | is_label = Column(Boolean, comment='是否标注') 150 | 151 | def __repr__(self): 152 | return f'' 153 | 154 | @classmethod 155 | def load(cls, data: dict): 156 | uid = data.get('id', -1) 157 | author_id = data.get('author', {}).get('id', -1) 158 | url = data.get('url', '') 159 | title = data.get('title', '') 160 | created = datetime.datetime.fromtimestamp(data.get('created', 0)) 161 | updated = datetime.datetime.fromtimestamp(data.get('updated', 0)) 162 | voteup_count = data.get('voteup_count', 0) 163 | comment_count = data.get('comment_count', 0) 164 | favlists_count = data.get('favlists_count', 0) 165 | content = BeautifulSoup(data.get('content', ''), 'html.parser').get_text().replace('"', "'").strip() 166 | is_label = data.get('is_label', False) 167 | 168 | return cls(uid=uid, author_id=author_id, url=url, title=title, created=created, updated=updated, 169 | voteup_count=voteup_count, comment_count=comment_count, favlists_count=favlists_count, 170 | content=content, is_label=is_label) 171 | 172 | 173 | class zhihu_question(Base): 174 | __tablename__ = 'zhihu_question' 175 | id = Column(Integer, primary_key=True, autoincrement=True) 176 | uid = Column(String(255), comment='问题id') 177 | author_id = Column(String(255), comment='作者id') 178 | title = Column(String(255), comment='问题标题') 179 | created = Column(DateTime, comment='创建时间') 180 | updated = Column(DateTime, comment='更新时间') 181 | question_type = Column(String(255), comment='问题类型') 182 | 183 | def __repr__(self): 184 | return f'' 185 | 186 | @classmethod 187 | def load(cls, data: dict): 188 | uid = data.get('id', -1) 189 | author_id = data.get('author', {}).get('id', -1) 190 | title = data.get('title', '') 191 | created = datetime.datetime.fromtimestamp(data.get('created_time', 0)) 192 | updated = datetime.datetime.fromtimestamp(data.get('updated_time', 0)) 193 | question_type = data.get('question_type', '') 194 | return cls(uid=uid, author_id=author_id, title=title, created=created, updated=updated, 195 | question_type=question_type) 196 | 197 | 198 | class zhihu_comment(Base): 199 | __tablename__ = 'zhihu_comment' 200 | id = Column(Integer, primary_key=True, autoincrement=True) 201 | uid = Column(String(255), comment='评论id') 202 | author_id = Column(String(255), comment='作者id') 203 | created = Column(DateTime, comment='创建时间') 204 | content = Column(LONGTEXT, comment='评论内容') 205 | vote_count = Column(BigInteger, comment='点赞数') 206 | reply_to_author_id = Column(String(255), comment='回复对象id') 207 | url = Column(String(255), comment='评论链接') 208 | address = Column(String(255), comment='评论地址') 209 | from_where = Column(String(255), comment='评论来源') 210 | 211 | def __repr__(self): 212 | return f'' 213 | 214 | @classmethod 215 | def load(cls, data: dict): 216 | uid = data.get('id', -1) 217 | author_id = data.get('author', {}).get('member', {}).get('id', -1) 218 | created = datetime.datetime.fromtimestamp(data.get('created_time', 0)) 219 | content = BeautifulSoup(data.get('content', ''), 'html.parser').get_text().replace('"', "'").strip() 220 | vote_count = data.get('vote_count', 0) 221 | reply_to_author_id = data.get('reply_to_author', {}).get('member', {}).get('id', -1) 222 | url = data.get('url', '') 223 | address = data.get('address_text', '') 224 | from_where = data.get('from_where', '') 225 | return cls(uid=uid, author_id=author_id, created=created, content=content, vote_count=vote_count, 226 | reply_to_author_id=reply_to_author_id, url=url, address=address, from_where=from_where) 227 | 228 | 229 | if __name__ == '__main__': 230 | db = Database('mysql+pymysql://root:20131114@localhost:3306/env?charset=utf8mb4') 231 | db.create_all_table() 232 | --------------------------------------------------------------------------------