├── .gitignore ├── .travis.yml ├── Api.md ├── LICENSE ├── MANIFEST ├── Pipfile ├── Pipfile.lock ├── README.md ├── _config.yml ├── asserts ├── jetbrains.svg └── weibo_tweets.png ├── packing_and_upload.sh ├── persistence ├── __init__.py └── persistence.py ├── samples ├── __init__.py └── weibo_flasgger │ ├── FLASGGER_README.md │ ├── __init__.py │ ├── flasgger_api.py │ ├── imgs │ └── weibo-flasgger.png │ └── ymls │ ├── exist_get_uid.yml │ ├── get_weibo_containerid.yml │ ├── search_by_name.yml │ ├── weibo_getIndex.yml │ └── weibo_tweets.yml ├── setup.py ├── tests ├── __init__.py └── test_weibo_scraper.py ├── weibo_base ├── __init__.py ├── weibo_api.py ├── weibo_component.py ├── weibo_exception.py ├── weibo_parser.py ├── weibo_typing.py └── weibo_util.py ├── weibo_scraper.py └── weibo_scraper_cli.py /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/python,django,intellij,pycharm 3 | 4 | ### Django ### 5 | *.log 6 | *.pot 7 | *.pyc 8 | __pycache__/ 9 | local_settings.py 10 | db.sqlite3 11 | media 12 | 13 | # If your build process includes running collectstatic, then you probably don't need or want to include staticfiles/ 14 | # in your Git repository. Update and uncomment the following line accordingly. 15 | # /staticfiles/ 16 | 17 | ### Intellij ### 18 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 19 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 20 | 21 | # User-specific stuff: 22 | .idea/**/workspace.xml 23 | .idea/**/tasks.xml 24 | .idea/dictionaries 25 | 26 | # Sensitive or high-churn files: 27 | .idea/**/dataSources/ 28 | .idea/**/dataSources.ids 29 | .idea/**/dataSources.xml 30 | .idea/**/dataSources.local.xml 31 | .idea/**/sqlDataSources.xml 32 | .idea/**/dynamic.xml 33 | .idea/**/uiDesigner.xml 34 | 35 | # Gradle: 36 | .idea/**/gradle.xml 37 | .idea/**/libraries 38 | 39 | # CMake 40 | cmake-build-debug/ 41 | 42 | # Mongo Explorer plugin: 43 | .idea/**/mongoSettings.xml 44 | 45 | ## File-based project format: 46 | *.iws 47 | 48 | ## Plugin-specific files: 49 | 50 | # IntelliJ 51 | /out/ 52 | 53 | # mpeltonen/sbt-idea plugin 54 | .idea_modules/ 55 | 56 | # JIRA plugin 57 | atlassian-ide-plugin.xml 58 | 59 | # Cursive Clojure plugin 60 | .idea/replstate.xml 61 | 62 | # Ruby plugin and RubyMine 63 | /.rakeTasks 64 | 65 | # Crashlytics plugin (for Android Studio and IntelliJ) 66 | com_crashlytics_export_strings.xml 67 | crashlytics.properties 68 | crashlytics-build.properties 69 | fabric.properties 70 | 71 | ### Intellij Patch ### 72 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 73 | 74 | # *.iml 75 | # modules.xml 76 | # .idea/misc.xml 77 | # *.ipr 78 | 79 | # Sonarlint plugin 80 | .idea/sonarlint 81 | 82 | ### PyCharm ### 83 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 84 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 85 | 86 | # User-specific stuff: 87 | 88 | # Sensitive or high-churn files: 89 | 90 | # Gradle: 91 | 92 | # CMake 93 | 94 | # Mongo Explorer plugin: 95 | 96 | ## File-based project format: 97 | 98 | ## Plugin-specific files: 99 | 100 | # IntelliJ 101 | 102 | # mpeltonen/sbt-idea plugin 103 | 104 | # JIRA plugin 105 | 106 | # Cursive Clojure plugin 107 | 108 | # Ruby plugin and RubyMine 109 | 110 | # Crashlytics plugin (for Android Studio and 
IntelliJ) 111 | 112 | ### PyCharm Patch ### 113 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 114 | 115 | # *.iml 116 | # modules.xml 117 | # .idea/misc.xml 118 | # *.ipr 119 | 120 | # Sonarlint plugin 121 | 122 | ### Python ### 123 | # Byte-compiled / optimized / DLL files 124 | *.py[cod] 125 | *$py.class 126 | 127 | # C extensions 128 | *.so 129 | 130 | # Distribution / packaging 131 | .Python 132 | build/ 133 | develop-eggs/ 134 | dist/ 135 | downloads/ 136 | eggs/ 137 | .eggs/ 138 | lib/ 139 | lib64/ 140 | parts/ 141 | sdist/ 142 | var/ 143 | wheels/ 144 | *.egg-info/ 145 | .installed.cfg 146 | *.egg 147 | 148 | # PyInstaller 149 | # Usually these files are written by a python script from a template 150 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 151 | *.manifest 152 | *.spec 153 | 154 | # Installer logs 155 | pip-log.txt 156 | pip-delete-this-directory.txt 157 | 158 | # Unit test / coverage reports 159 | htmlcov/ 160 | .tox/ 161 | .coverage 162 | .coverage.* 163 | .cache 164 | .pytest_cache/ 165 | nosetests.xml 166 | coverage.xml 167 | *.cover 168 | .hypothesis/ 169 | 170 | # Translations 171 | *.mo 172 | 173 | # Flask stuff: 174 | instance/ 175 | .webassets-cache 176 | 177 | # Scrapy stuff: 178 | .scrapy 179 | 180 | # Sphinx documentation 181 | docs/_build/ 182 | 183 | # PyBuilder 184 | target/ 185 | 186 | # Jupyter Notebook 187 | .ipynb_checkpoints 188 | 189 | # pyenv 190 | .python-version 191 | 192 | # celery beat schedule file 193 | celerybeat-schedule.* 194 | 195 | # SageMath parsed files 196 | *.sage.py 197 | 198 | # Environments 199 | .env 200 | .venv 201 | env/ 202 | venv/ 203 | ENV/ 204 | env.bak/ 205 | venv.bak/ 206 | 207 | # Spyder project settings 208 | .spyderproject 209 | .spyproject 210 | 211 | # Rope project settings 212 | .ropeproject 213 | 214 | # mkdocs documentation 215 | /site 216 | 217 | # mypy 218 | .mypy_cache/ 219 | 220 | 221 | # End of https://www.gitignore.io/api/python,django,intellij,pycharm -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | matrix: 3 | include: 4 | - python: 3.6 5 | - python: 3.7-dev 6 | dist: xenial 7 | sudo: true 8 | install: 9 | - pip install codecov 10 | - pip install -r requirements.txt 11 | script: 12 | - coverage run test_weibo_scraper.py 13 | after_success: 14 | - codecov 15 | -------------------------------------------------------------------------------- /Api.md: -------------------------------------------------------------------------------- 1 | 2 | 通过微博名称获取微博 3 | * weibo.get_weibo_tweets_by_name 4 | 5 | ```json 6 | 7 | ``` 8 | 9 | 通过 tweet_container_id 获取微博 10 | * get_weibo_tweets 11 | ```json 12 | 13 | ``` 14 | 15 | 通过微博名查询格式化的微博(包含评论) 16 | 17 | * get_formatted_weibo_tweets_by_name 18 | ```json 19 | ``` 20 | 21 | 通过containerId查询格式化的微博(包含评论) 22 | 23 | * get_weibo_tweets_formatted 24 | 25 | 26 | 获取微博用户信息 27 | * weibo_get_index_parser 28 | * get_weibo_profile 29 | 30 | 获取关注者和被关注者 31 | 32 | * get_follows_and_followers 33 | 34 | 35 | 获取被关注者 36 | * get_follows 37 | 38 | 获取粉丝 39 | * get_followers 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of 
this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | # Include the README 2 | include *.md 3 | 4 | # Include the license file 5 | include LICENSE.txt 6 | 7 | # Include the data files 8 | recursive-include data * -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://mirrors.aliyun.com/pypi/simple/" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | requests = { extras = ['socks'] } 8 | twine = "*" 9 | pytest = "*" 10 | requests-html = "*" 11 | flask = "*" 12 | flasgger = "*" 13 | prompt-toolkit = "*" 14 | typing-extensions = "*" 15 | 16 | [dev-packages] 17 | 18 | [requires] 19 | python_version = "3.6" 20 | -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | "sha256": "84512d50f35687680dd404deb2a44af2906cc8fc60492e42ee04f88a89deecd4" 5 | }, 6 | "pipfile-spec": 6, 7 | "requires": { 8 | "python_version": "3.6" 9 | }, 10 | "sources": [ 11 | { 12 | "name": "pypi", 13 | "url": "https://mirrors.aliyun.com/pypi/simple/", 14 | "verify_ssl": true 15 | } 16 | ] 17 | }, 18 | "default": { 19 | "appdirs": { 20 | "hashes": [ 21 | "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41", 22 | "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128" 23 | ], 24 | "version": "==1.4.4" 25 | }, 26 | "atomicwrites": { 27 | "hashes": [ 28 | "sha256:81b2c9071a49367a7f770170e5eec8cb66567cfbbc8c73d20ce5ca4a8d71cf11" 29 | ], 30 | "markers": "sys_platform == 'win32'", 31 | "version": "==1.4.1" 32 | }, 33 | "attrs": { 34 | "hashes": [ 35 | "sha256:29adc2665447e5191d0e7c568fde78b21f9672d344281d0c6e1ab085429b22b6", 36 | "sha256:86efa402f67bf2df34f51a335487cf46b1ec130d02b8d39fd248abfd30da551c" 37 | ], 38 | "markers": "python_version >= '3.5'", 39 | "version": "==22.1.0" 40 | }, 41 | "beautifulsoup4": { 42 | "hashes": [ 43 | "sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30", 44 | "sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693" 45 | ], 46 | "markers": "python_version >= '3.6'", 47 | "version": "==4.11.1" 48 | }, 49 | "bleach": { 50 | "hashes": [ 51 | 
"sha256:0900d8b37eba61a802ee40ac0061f8c2b5dee29c1927dd1d233e075ebf5a71da", 52 | "sha256:4d2651ab93271d1129ac9cbc679f524565cc8a1b791909c4a51eac4446a15994" 53 | ], 54 | "markers": "python_version >= '3.6'", 55 | "version": "==4.1.0" 56 | }, 57 | "bs4": { 58 | "hashes": [ 59 | "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a" 60 | ], 61 | "version": "==0.0.1" 62 | }, 63 | "certifi": { 64 | "hashes": [ 65 | "sha256:0d9c601124e5a6ba9712dbc60d9c53c21e34f5f641fe83002317394311bdce14", 66 | "sha256:90c1a32f1d68f940488354e36370f6cca89f0f106db09518524c88d6ed83f382" 67 | ], 68 | "markers": "python_version >= '3.6'", 69 | "version": "==2022.9.24" 70 | }, 71 | "charset-normalizer": { 72 | "hashes": [ 73 | "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597", 74 | "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df" 75 | ], 76 | "markers": "python_version >= '3'", 77 | "version": "==2.0.12" 78 | }, 79 | "click": { 80 | "hashes": [ 81 | "sha256:6a7a62563bbfabfda3a38f3023a1db4a35978c0abd76f6c9605ecd6554d6d9b1", 82 | "sha256:8458d7b1287c5fb128c90e23381cf99dcde74beaf6c7ff6384ce84d6fe090adb" 83 | ], 84 | "markers": "python_version >= '3.6'", 85 | "version": "==8.0.4" 86 | }, 87 | "colorama": { 88 | "hashes": [ 89 | "sha256:854bf444933e37f5824ae7bfc1e98d5bce2ebe4160d46b5edf346a89358e99da", 90 | "sha256:e6c6b4334fc50988a639d9b98aa429a0b57da6e17b9a44f0451f930b6967b7a4" 91 | ], 92 | "markers": "sys_platform == 'win32'", 93 | "version": "==0.4.5" 94 | }, 95 | "cssselect": { 96 | "hashes": [ 97 | "sha256:f612ee47b749c877ebae5bb77035d8f4202c6ad0f0fc1271b3c18ad6c4468ecf", 98 | "sha256:f95f8dedd925fd8f54edb3d2dfb44c190d9d18512377d3c1e2388d16126879bc" 99 | ], 100 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", 101 | "version": "==1.1.0" 102 | }, 103 | "dataclasses": { 104 | "hashes": [ 105 | "sha256:0201d89fa866f68c8ebd9d08ee6ff50c0b255f8ec63a71c16fda7af82bb887bf", 106 | "sha256:8479067f342acf957dc82ec415d355ab5edb7e7646b90dc6e2fd1d96ad084c97" 107 | ], 108 | "markers": "python_version < '3.7'", 109 | "version": "==0.8" 110 | }, 111 | "docutils": { 112 | "hashes": [ 113 | "sha256:23010f129180089fbcd3bc08cfefccb3b890b0050e1ca00c867036e9d161b98c", 114 | "sha256:679987caf361a7539d76e584cbeddc311e3aee937877c87346f31debc63e9d06" 115 | ], 116 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", 117 | "version": "==0.18.1" 118 | }, 119 | "fake-useragent": { 120 | "hashes": [ 121 | "sha256:2bc361f03a7f7ca68c5be2863d33067f8c56a7076d129d3a5c930377551acf5a", 122 | "sha256:59a16d8cd109dadeef3375a27818d6cc9511fc3ec430fe564b8490cbec3ea861" 123 | ], 124 | "version": "==1.0.1" 125 | }, 126 | "flasgger": { 127 | "hashes": [ 128 | "sha256:0603941cf4003626b4ee551ca87331f1d17b8eecce500ccf1a1f1d3a332fc94a", 129 | "sha256:6ebea406b5beecd77e8da42550f380d4d05a6107bc90b69ce9e77aee7612e2d0" 130 | ], 131 | "index": "pypi", 132 | "version": "==0.9.5" 133 | }, 134 | "flask": { 135 | "hashes": [ 136 | "sha256:59da8a3170004800a2837844bfa84d49b022550616070f7cb1a659682b2e7c9f", 137 | "sha256:e1120c228ca2f553b470df4a5fa927ab66258467526069981b3eb0a91902687d" 138 | ], 139 | "index": "pypi", 140 | "version": "==2.0.3" 141 | }, 142 | "idna": { 143 | "hashes": [ 144 | "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4", 145 | "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2" 146 | ], 147 | "markers": "python_version >= '3'", 148 | "version": "==3.4" 149 | }, 150 | 
"importlib-metadata": { 151 | "hashes": [ 152 | "sha256:65a9576a5b2d58ca44d133c42a241905cc45e34d2c06fd5ba2bafa221e5d7b5e", 153 | "sha256:766abffff765960fcc18003801f7044eb6755ffae4521c8e8ce8e83b9c9b0668" 154 | ], 155 | "markers": "python_version < '3.8'", 156 | "version": "==4.8.3" 157 | }, 158 | "importlib-resources": { 159 | "hashes": [ 160 | "sha256:33a95faed5fc19b4bc16b29a6eeae248a3fe69dd55d4d229d2b480e23eeaad45", 161 | "sha256:d756e2f85dd4de2ba89be0b21dba2a3bbec2e871a42a3a16719258a11f87506b" 162 | ], 163 | "markers": "python_version < '3.7'", 164 | "version": "==5.4.0" 165 | }, 166 | "iniconfig": { 167 | "hashes": [ 168 | "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3", 169 | "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32" 170 | ], 171 | "version": "==1.1.1" 172 | }, 173 | "itsdangerous": { 174 | "hashes": [ 175 | "sha256:5174094b9637652bdb841a3029700391451bd092ba3db90600dea710ba28e97c", 176 | "sha256:9e724d68fc22902a1435351f84c3fb8623f303fffcc566a4cb952df8c572cff0" 177 | ], 178 | "markers": "python_version >= '3.6'", 179 | "version": "==2.0.1" 180 | }, 181 | "jinja2": { 182 | "hashes": [ 183 | "sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8", 184 | "sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7" 185 | ], 186 | "markers": "python_version >= '3.6'", 187 | "version": "==3.0.3" 188 | }, 189 | "jsonschema": { 190 | "hashes": [ 191 | "sha256:4e5b3cf8216f577bee9ce139cbe72eca3ea4f292ec60928ff24758ce626cd163", 192 | "sha256:c8a85b28d377cc7737e46e2d9f2b4f44ee3c0e1deac6bf46ddefc7187d30797a" 193 | ], 194 | "version": "==3.2.0" 195 | }, 196 | "keyring": { 197 | "hashes": [ 198 | "sha256:17e49fb0d6883c2b4445359434dba95aad84aabb29bbff044ad0ed7100232eca", 199 | "sha256:89cbd74d4683ed164c8082fb38619341097741323b3786905c6dac04d6915a55" 200 | ], 201 | "markers": "python_version >= '3.6'", 202 | "version": "==23.4.1" 203 | }, 204 | "lxml": { 205 | "hashes": [ 206 | "sha256:04da965dfebb5dac2619cb90fcf93efdb35b3c6994fea58a157a834f2f94b318", 207 | "sha256:0538747a9d7827ce3e16a8fdd201a99e661c7dee3c96c885d8ecba3c35d1032c", 208 | "sha256:0645e934e940107e2fdbe7c5b6fb8ec6232444260752598bc4d09511bd056c0b", 209 | "sha256:079b68f197c796e42aa80b1f739f058dcee796dc725cc9a1be0cdb08fc45b000", 210 | "sha256:0f3f0059891d3254c7b5fb935330d6db38d6519ecd238ca4fce93c234b4a0f73", 211 | "sha256:10d2017f9150248563bb579cd0d07c61c58da85c922b780060dcc9a3aa9f432d", 212 | "sha256:1355755b62c28950f9ce123c7a41460ed9743c699905cbe664a5bcc5c9c7c7fb", 213 | "sha256:13c90064b224e10c14dcdf8086688d3f0e612db53766e7478d7754703295c7c8", 214 | "sha256:1423631e3d51008871299525b541413c9b6c6423593e89f9c4cfbe8460afc0a2", 215 | "sha256:1436cf0063bba7888e43f1ba8d58824f085410ea2025befe81150aceb123e345", 216 | "sha256:1a7c59c6ffd6ef5db362b798f350e24ab2cfa5700d53ac6681918f314a4d3b94", 217 | "sha256:1e1cf47774373777936c5aabad489fef7b1c087dcd1f426b621fda9dcc12994e", 218 | "sha256:206a51077773c6c5d2ce1991327cda719063a47adc02bd703c56a662cdb6c58b", 219 | "sha256:21fb3d24ab430fc538a96e9fbb9b150029914805d551deeac7d7822f64631dfc", 220 | "sha256:27e590352c76156f50f538dbcebd1925317a0f70540f7dc8c97d2931c595783a", 221 | "sha256:287605bede6bd36e930577c5925fcea17cb30453d96a7b4c63c14a257118dbb9", 222 | "sha256:2aaf6a0a6465d39b5ca69688fce82d20088c1838534982996ec46633dc7ad6cc", 223 | "sha256:32a73c53783becdb7eaf75a2a1525ea8e49379fb7248c3eeefb9412123536387", 224 | "sha256:41fb58868b816c202e8881fd0f179a4644ce6e7cbbb248ef0283a34b73ec73bb", 225 | 
"sha256:4780677767dd52b99f0af1f123bc2c22873d30b474aa0e2fc3fe5e02217687c7", 226 | "sha256:4878e667ebabe9b65e785ac8da4d48886fe81193a84bbe49f12acff8f7a383a4", 227 | "sha256:487c8e61d7acc50b8be82bda8c8d21d20e133c3cbf41bd8ad7eb1aaeb3f07c97", 228 | "sha256:4beea0f31491bc086991b97517b9683e5cfb369205dac0148ef685ac12a20a67", 229 | "sha256:4cfbe42c686f33944e12f45a27d25a492cc0e43e1dc1da5d6a87cbcaf2e95627", 230 | "sha256:4d5bae0a37af799207140652a700f21a85946f107a199bcb06720b13a4f1f0b7", 231 | "sha256:4e285b5f2bf321fc0857b491b5028c5f276ec0c873b985d58d7748ece1d770dd", 232 | "sha256:57e4d637258703d14171b54203fd6822fda218c6c2658a7d30816b10995f29f3", 233 | "sha256:5974895115737a74a00b321e339b9c3f45c20275d226398ae79ac008d908bff7", 234 | "sha256:5ef87fca280fb15342726bd5f980f6faf8b84a5287fcc2d4962ea8af88b35130", 235 | "sha256:603a464c2e67d8a546ddaa206d98e3246e5db05594b97db844c2f0a1af37cf5b", 236 | "sha256:6653071f4f9bac46fbc30f3c7838b0e9063ee335908c5d61fb7a4a86c8fd2036", 237 | "sha256:6ca2264f341dd81e41f3fffecec6e446aa2121e0b8d026fb5130e02de1402785", 238 | "sha256:6d279033bf614953c3fc4a0aa9ac33a21e8044ca72d4fa8b9273fe75359d5cca", 239 | "sha256:6d949f53ad4fc7cf02c44d6678e7ff05ec5f5552b235b9e136bd52e9bf730b91", 240 | "sha256:6daa662aba22ef3258934105be2dd9afa5bb45748f4f702a3b39a5bf53a1f4dc", 241 | "sha256:6eafc048ea3f1b3c136c71a86db393be36b5b3d9c87b1c25204e7d397cee9536", 242 | "sha256:830c88747dce8a3e7525defa68afd742b4580df6aa2fdd6f0855481e3994d391", 243 | "sha256:86e92728ef3fc842c50a5cb1d5ba2bc66db7da08a7af53fb3da79e202d1b2cd3", 244 | "sha256:8caf4d16b31961e964c62194ea3e26a0e9561cdf72eecb1781458b67ec83423d", 245 | "sha256:8d1a92d8e90b286d491e5626af53afef2ba04da33e82e30744795c71880eaa21", 246 | "sha256:8f0a4d179c9a941eb80c3a63cdb495e539e064f8054230844dcf2fcb812b71d3", 247 | "sha256:9232b09f5efee6a495a99ae6824881940d6447debe272ea400c02e3b68aad85d", 248 | "sha256:927a9dd016d6033bc12e0bf5dee1dde140235fc8d0d51099353c76081c03dc29", 249 | "sha256:93e414e3206779ef41e5ff2448067213febf260ba747fc65389a3ddaa3fb8715", 250 | "sha256:98cafc618614d72b02185ac583c6f7796202062c41d2eeecdf07820bad3295ed", 251 | "sha256:9c3a88d20e4fe4a2a4a84bf439a5ac9c9aba400b85244c63a1ab7088f85d9d25", 252 | "sha256:9f36de4cd0c262dd9927886cc2305aa3f2210db437aa4fed3fb4940b8bf4592c", 253 | "sha256:a60f90bba4c37962cbf210f0188ecca87daafdf60271f4c6948606e4dabf8785", 254 | "sha256:a614e4afed58c14254e67862456d212c4dcceebab2eaa44d627c2ca04bf86837", 255 | "sha256:ae06c1e4bc60ee076292e582a7512f304abdf6c70db59b56745cca1684f875a4", 256 | "sha256:b122a188cd292c4d2fcd78d04f863b789ef43aa129b233d7c9004de08693728b", 257 | "sha256:b570da8cd0012f4af9fa76a5635cd31f707473e65a5a335b186069d5c7121ff2", 258 | "sha256:bcaa1c495ce623966d9fc8a187da80082334236a2a1c7e141763ffaf7a405067", 259 | "sha256:bd34f6d1810d9354dc7e35158aa6cc33456be7706df4420819af6ed966e85448", 260 | "sha256:be9eb06489bc975c38706902cbc6888f39e946b81383abc2838d186f0e8b6a9d", 261 | "sha256:c4b2e0559b68455c085fb0f6178e9752c4be3bba104d6e881eb5573b399d1eb2", 262 | "sha256:c62e8dd9754b7debda0c5ba59d34509c4688f853588d75b53c3791983faa96fc", 263 | "sha256:c852b1530083a620cb0de5f3cd6826f19862bafeaf77586f1aef326e49d95f0c", 264 | "sha256:d9fc0bf3ff86c17348dfc5d322f627d78273eba545db865c3cd14b3f19e57fa5", 265 | "sha256:dad7b164905d3e534883281c050180afcf1e230c3d4a54e8038aa5cfcf312b84", 266 | "sha256:e5f66bdf0976ec667fc4594d2812a00b07ed14d1b44259d19a41ae3fff99f2b8", 267 | "sha256:e8f0c9d65da595cfe91713bc1222af9ecabd37971762cb830dea2fc3b3bb2acf", 268 | 
"sha256:edffbe3c510d8f4bf8640e02ca019e48a9b72357318383ca60e3330c23aaffc7", 269 | "sha256:eea5d6443b093e1545ad0210e6cf27f920482bfcf5c77cdc8596aec73523bb7e", 270 | "sha256:ef72013e20dd5ba86a8ae1aed7f56f31d3374189aa8b433e7b12ad182c0d2dfb", 271 | "sha256:f05251bbc2145349b8d0b77c0d4e5f3b228418807b1ee27cefb11f69ed3d233b", 272 | "sha256:f1be258c4d3dc609e654a1dc59d37b17d7fef05df912c01fc2e15eb43a9735f3", 273 | "sha256:f9ced82717c7ec65a67667bb05865ffe38af0e835cdd78728f1209c8fffe0cad", 274 | "sha256:fe17d10b97fdf58155f858606bddb4e037b805a60ae023c009f760d8361a4eb8", 275 | "sha256:fe749b052bb7233fe5d072fcb549221a8cb1a16725c47c37e42b0b9cb3ff2c3f" 276 | ], 277 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", 278 | "version": "==4.9.1" 279 | }, 280 | "markupsafe": { 281 | "hashes": [ 282 | "sha256:01a9b8ea66f1658938f65b93a85ebe8bc016e6769611be228d797c9d998dd298", 283 | "sha256:023cb26ec21ece8dc3907c0e8320058b2e0cb3c55cf9564da612bc325bed5e64", 284 | "sha256:0446679737af14f45767963a1a9ef7620189912317d095f2d9ffa183a4d25d2b", 285 | "sha256:04635854b943835a6ea959e948d19dcd311762c5c0c6e1f0e16ee57022669194", 286 | "sha256:0717a7390a68be14b8c793ba258e075c6f4ca819f15edfc2a3a027c823718567", 287 | "sha256:0955295dd5eec6cb6cc2fe1698f4c6d84af2e92de33fbcac4111913cd100a6ff", 288 | "sha256:0d4b31cc67ab36e3392bbf3862cfbadac3db12bdd8b02a2731f509ed5b829724", 289 | "sha256:10f82115e21dc0dfec9ab5c0223652f7197feb168c940f3ef61563fc2d6beb74", 290 | "sha256:168cd0a3642de83558a5153c8bd34f175a9a6e7f6dc6384b9655d2697312a646", 291 | "sha256:1d609f577dc6e1aa17d746f8bd3c31aa4d258f4070d61b2aa5c4166c1539de35", 292 | "sha256:1f2ade76b9903f39aa442b4aadd2177decb66525062db244b35d71d0ee8599b6", 293 | "sha256:20dca64a3ef2d6e4d5d615a3fd418ad3bde77a47ec8a23d984a12b5b4c74491a", 294 | "sha256:2a7d351cbd8cfeb19ca00de495e224dea7e7d919659c2841bbb7f420ad03e2d6", 295 | "sha256:2d7d807855b419fc2ed3e631034685db6079889a1f01d5d9dac950f764da3dad", 296 | "sha256:2ef54abee730b502252bcdf31b10dacb0a416229b72c18b19e24a4509f273d26", 297 | "sha256:36bc903cbb393720fad60fc28c10de6acf10dc6cc883f3e24ee4012371399a38", 298 | "sha256:37205cac2a79194e3750b0af2a5720d95f786a55ce7df90c3af697bfa100eaac", 299 | "sha256:3c112550557578c26af18a1ccc9e090bfe03832ae994343cfdacd287db6a6ae7", 300 | "sha256:3dd007d54ee88b46be476e293f48c85048603f5f516008bee124ddd891398ed6", 301 | "sha256:4296f2b1ce8c86a6aea78613c34bb1a672ea0e3de9c6ba08a960efe0b0a09047", 302 | "sha256:47ab1e7b91c098ab893b828deafa1203de86d0bc6ab587b160f78fe6c4011f75", 303 | "sha256:49e3ceeabbfb9d66c3aef5af3a60cc43b85c33df25ce03d0031a608b0a8b2e3f", 304 | "sha256:4dc8f9fb58f7364b63fd9f85013b780ef83c11857ae79f2feda41e270468dd9b", 305 | "sha256:4efca8f86c54b22348a5467704e3fec767b2db12fc39c6d963168ab1d3fc9135", 306 | "sha256:53edb4da6925ad13c07b6d26c2a852bd81e364f95301c66e930ab2aef5b5ddd8", 307 | "sha256:5855f8438a7d1d458206a2466bf82b0f104a3724bf96a1c781ab731e4201731a", 308 | "sha256:594c67807fb16238b30c44bdf74f36c02cdf22d1c8cda91ef8a0ed8dabf5620a", 309 | "sha256:5b6d930f030f8ed98e3e6c98ffa0652bdb82601e7a016ec2ab5d7ff23baa78d1", 310 | "sha256:5bb28c636d87e840583ee3adeb78172efc47c8b26127267f54a9c0ec251d41a9", 311 | "sha256:60bf42e36abfaf9aff1f50f52644b336d4f0a3fd6d8a60ca0d054ac9f713a864", 312 | "sha256:611d1ad9a4288cf3e3c16014564df047fe08410e628f89805e475368bd304914", 313 | "sha256:6300b8454aa6930a24b9618fbb54b5a68135092bc666f7b06901f897fa5c2fee", 314 | "sha256:63f3268ba69ace99cab4e3e3b5840b03340efed0948ab8f78d2fd87ee5442a4f", 315 | 
"sha256:6557b31b5e2c9ddf0de32a691f2312a32f77cd7681d8af66c2692efdbef84c18", 316 | "sha256:693ce3f9e70a6cf7d2fb9e6c9d8b204b6b39897a2c4a1aa65728d5ac97dcc1d8", 317 | "sha256:6a7fae0dd14cf60ad5ff42baa2e95727c3d81ded453457771d02b7d2b3f9c0c2", 318 | "sha256:6c4ca60fa24e85fe25b912b01e62cb969d69a23a5d5867682dd3e80b5b02581d", 319 | "sha256:6fcf051089389abe060c9cd7caa212c707e58153afa2c649f00346ce6d260f1b", 320 | "sha256:7d91275b0245b1da4d4cfa07e0faedd5b0812efc15b702576d103293e252af1b", 321 | "sha256:89c687013cb1cd489a0f0ac24febe8c7a666e6e221b783e53ac50ebf68e45d86", 322 | "sha256:8d206346619592c6200148b01a2142798c989edcb9c896f9ac9722a99d4e77e6", 323 | "sha256:905fec760bd2fa1388bb5b489ee8ee5f7291d692638ea5f67982d968366bef9f", 324 | "sha256:97383d78eb34da7e1fa37dd273c20ad4320929af65d156e35a5e2d89566d9dfb", 325 | "sha256:984d76483eb32f1bcb536dc27e4ad56bba4baa70be32fa87152832cdd9db0833", 326 | "sha256:99df47edb6bda1249d3e80fdabb1dab8c08ef3975f69aed437cb69d0a5de1e28", 327 | "sha256:9f02365d4e99430a12647f09b6cc8bab61a6564363f313126f775eb4f6ef798e", 328 | "sha256:a30e67a65b53ea0a5e62fe23682cfe22712e01f453b95233b25502f7c61cb415", 329 | "sha256:ab3ef638ace319fa26553db0624c4699e31a28bb2a835c5faca8f8acf6a5a902", 330 | "sha256:aca6377c0cb8a8253e493c6b451565ac77e98c2951c45f913e0b52facdcff83f", 331 | "sha256:add36cb2dbb8b736611303cd3bfcee00afd96471b09cda130da3581cbdc56a6d", 332 | "sha256:b2f4bf27480f5e5e8ce285a8c8fd176c0b03e93dcc6646477d4630e83440c6a9", 333 | "sha256:b7f2d075102dc8c794cbde1947378051c4e5180d52d276987b8d28a3bd58c17d", 334 | "sha256:baa1a4e8f868845af802979fcdbf0bb11f94f1cb7ced4c4b8a351bb60d108145", 335 | "sha256:be98f628055368795d818ebf93da628541e10b75b41c559fdf36d104c5787066", 336 | "sha256:bf5d821ffabf0ef3533c39c518f3357b171a1651c1ff6827325e4489b0e46c3c", 337 | "sha256:c47adbc92fc1bb2b3274c4b3a43ae0e4573d9fbff4f54cd484555edbf030baf1", 338 | "sha256:cdfba22ea2f0029c9261a4bd07e830a8da012291fbe44dc794e488b6c9bb353a", 339 | "sha256:d6c7ebd4e944c85e2c3421e612a7057a2f48d478d79e61800d81468a8d842207", 340 | "sha256:d7f9850398e85aba693bb640262d3611788b1f29a79f0c93c565694658f4071f", 341 | "sha256:d8446c54dc28c01e5a2dbac5a25f071f6653e6e40f3a8818e8b45d790fe6ef53", 342 | "sha256:deb993cacb280823246a026e3b2d81c493c53de6acfd5e6bfe31ab3402bb37dd", 343 | "sha256:e0f138900af21926a02425cf736db95be9f4af72ba1bb21453432a07f6082134", 344 | "sha256:e9936f0b261d4df76ad22f8fee3ae83b60d7c3e871292cd42f40b81b70afae85", 345 | "sha256:f0567c4dc99f264f49fe27da5f735f414c4e7e7dd850cfd8e69f0862d7c74ea9", 346 | "sha256:f5653a225f31e113b152e56f154ccbe59eeb1c7487b39b9d9f9cdb58e6c79dc5", 347 | "sha256:f826e31d18b516f653fe296d967d700fddad5901ae07c622bb3705955e1faa94", 348 | "sha256:f8ba0e8349a38d3001fae7eadded3f6606f0da5d748ee53cc1dab1d6527b9509", 349 | "sha256:f9081981fe268bd86831e5c75f7de206ef275defcb82bc70740ae6dc507aee51", 350 | "sha256:fa130dd50c57d53368c9d59395cb5526eda596d3ffe36666cd81a44d56e48872" 351 | ], 352 | "markers": "python_version >= '3.6'", 353 | "version": "==2.0.1" 354 | }, 355 | "mistune": { 356 | "hashes": [ 357 | "sha256:182cc5ee6f8ed1b807de6b7bb50155df7b66495412836b9a74c8fbdfc75fe36d", 358 | "sha256:9ee0a66053e2267aba772c71e06891fa8f1af6d4b01d5e84e267b4570d4d9808" 359 | ], 360 | "version": "==2.0.4" 361 | }, 362 | "packaging": { 363 | "hashes": [ 364 | "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb", 365 | "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522" 366 | ], 367 | "markers": "python_version >= '3.6'", 368 | "version": "==21.3" 369 | }, 370 | "parse": { 371 | 
"hashes": [ 372 | "sha256:9ff82852bcb65d139813e2a5197627a94966245c897796760a3a2a8eb66f020b" 373 | ], 374 | "version": "==1.19.0" 375 | }, 376 | "pkginfo": { 377 | "hashes": [ 378 | "sha256:848865108ec99d4901b2f7e84058b6e7660aae8ae10164e015a6dcf5b242a594", 379 | "sha256:a84da4318dd86f870a9447a8c98340aa06216bfc6f2b7bdc4b8766984ae1867c" 380 | ], 381 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", 382 | "version": "==1.8.3" 383 | }, 384 | "pluggy": { 385 | "hashes": [ 386 | "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159", 387 | "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3" 388 | ], 389 | "markers": "python_version >= '3.6'", 390 | "version": "==1.0.0" 391 | }, 392 | "prompt-toolkit": { 393 | "hashes": [ 394 | "sha256:535c29c31216c77302877d5120aef6c94ff573748a5b5ca5b1b1f76f5e700c73", 395 | "sha256:ced598b222f6f4029c0800cefaa6a17373fb580cd093223003475ce32805c35b" 396 | ], 397 | "index": "pypi", 398 | "version": "==3.0.33" 399 | }, 400 | "py": { 401 | "hashes": [ 402 | "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719", 403 | "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378" 404 | ], 405 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", 406 | "version": "==1.11.0" 407 | }, 408 | "pyee": { 409 | "hashes": [ 410 | "sha256:5c7e60f8df95710dbe17550e16ce0153f83990c00ef744841b43f371ed53ebea", 411 | "sha256:c09f56e36eb10bf23aa2aacf145f690ded75b990a3d9523fd478b005940303d2" 412 | ], 413 | "version": "==8.2.2" 414 | }, 415 | "pygments": { 416 | "hashes": [ 417 | "sha256:56a8508ae95f98e2b9bdf93a6be5ae3f7d8af858b43e02c5a2ff083726be40c1", 418 | "sha256:f643f331ab57ba3c9d89212ee4a2dabc6e94f117cf4eefde99a0574720d14c42" 419 | ], 420 | "markers": "python_version >= '3.6'", 421 | "version": "==2.13.0" 422 | }, 423 | "pyparsing": { 424 | "hashes": [ 425 | "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb", 426 | "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc" 427 | ], 428 | "markers": "python_full_version >= '3.6.8'", 429 | "version": "==3.0.9" 430 | }, 431 | "pyppeteer": { 432 | "hashes": [ 433 | "sha256:4621bb890e54f43dce84f5139ea4d484a62886be1903c2fcb393af607943538f", 434 | "sha256:85adde940cc96820725db59cbdb13384aefd0dd043858cfa4f1c086c0f9e4137" 435 | ], 436 | "markers": "python_full_version >= '3.6.1' and python_full_version < '4.0.0'", 437 | "version": "==0.2.6" 438 | }, 439 | "pyquery": { 440 | "hashes": [ 441 | "sha256:1fc33b7699455ed25c75282bc8f80ace1ac078b0dda5a933dacbd8b1c1f83963", 442 | "sha256:a388eefb6bc4a55350de0316fbd97cda999ae669b6743ae5b99102ba54f5aa72" 443 | ], 444 | "version": "==1.4.3" 445 | }, 446 | "pyrsistent": { 447 | "hashes": [ 448 | "sha256:097b96f129dd36a8c9e33594e7ebb151b1515eb52cceb08474c10a5479e799f2", 449 | "sha256:2aaf19dc8ce517a8653746d98e962ef480ff34b6bc563fc067be6401ffb457c7", 450 | "sha256:404e1f1d254d314d55adb8d87f4f465c8693d6f902f67eb6ef5b4526dc58e6ea", 451 | "sha256:48578680353f41dca1ca3dc48629fb77dfc745128b56fc01096b2530c13fd426", 452 | "sha256:4916c10896721e472ee12c95cdc2891ce5890898d2f9907b1b4ae0f53588b710", 453 | "sha256:527be2bfa8dc80f6f8ddd65242ba476a6c4fb4e3aedbf281dfbac1b1ed4165b1", 454 | "sha256:58a70d93fb79dc585b21f9d72487b929a6fe58da0754fa4cb9f279bb92369396", 455 | "sha256:5e4395bbf841693eaebaa5bb5c8f5cdbb1d139e07c975c682ec4e4f8126e03d2", 456 | "sha256:6b5eed00e597b5b5773b4ca30bd48a5774ef1e96f2a45d105db5b4ebb4bca680", 457 | 
"sha256:73ff61b1411e3fb0ba144b8f08d6749749775fe89688093e1efef9839d2dcc35", 458 | "sha256:772e94c2c6864f2cd2ffbe58bb3bdefbe2a32afa0acb1a77e472aac831f83427", 459 | "sha256:773c781216f8c2900b42a7b638d5b517bb134ae1acbebe4d1e8f1f41ea60eb4b", 460 | "sha256:a0c772d791c38bbc77be659af29bb14c38ced151433592e326361610250c605b", 461 | "sha256:b29b869cf58412ca5738d23691e96d8aff535e17390128a1a52717c9a109da4f", 462 | "sha256:c1a9ff320fa699337e05edcaae79ef8c2880b52720bc031b219e5b5008ebbdef", 463 | "sha256:cd3caef37a415fd0dae6148a1b6957a8c5f275a62cca02e18474608cb263640c", 464 | "sha256:d5ec194c9c573aafaceebf05fc400656722793dac57f254cd4741f3c27ae57b4", 465 | "sha256:da6e5e818d18459fa46fac0a4a4e543507fe1110e808101277c5a2b5bab0cd2d", 466 | "sha256:e79d94ca58fcafef6395f6352383fa1a76922268fa02caa2272fff501c2fdc78", 467 | "sha256:f3ef98d7b76da5eb19c37fda834d50262ff9167c65658d1d8f974d2e4d90676b", 468 | "sha256:f4c8cabb46ff8e5d61f56a037974228e978f26bfefce4f61a4b1ac0ba7a2ab72" 469 | ], 470 | "markers": "python_version >= '3.6'", 471 | "version": "==0.18.0" 472 | }, 473 | "pysocks": { 474 | "hashes": [ 475 | "sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299", 476 | "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5", 477 | "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0" 478 | ], 479 | "version": "==1.7.1" 480 | }, 481 | "pytest": { 482 | "hashes": [ 483 | "sha256:9ce3ff477af913ecf6321fe337b93a2c0dcf2a0a1439c43f5452112c1e4280db", 484 | "sha256:e30905a0c131d3d94b89624a1cc5afec3e0ba2fbdb151867d8e0ebd49850f171" 485 | ], 486 | "index": "pypi", 487 | "version": "==7.0.1" 488 | }, 489 | "pywin32-ctypes": { 490 | "hashes": [ 491 | "sha256:24ffc3b341d457d48e8922352130cf2644024a4ff09762a2261fd34c36ee5942", 492 | "sha256:9dc2d991b3479cc2df15930958b674a48a227d5361d413827a4cfd0b5876fc98" 493 | ], 494 | "markers": "sys_platform == 'win32'", 495 | "version": "==0.2.0" 496 | }, 497 | "pyyaml": { 498 | "hashes": [ 499 | "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf", 500 | "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293", 501 | "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b", 502 | "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57", 503 | "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b", 504 | "sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4", 505 | "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07", 506 | "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba", 507 | "sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9", 508 | "sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287", 509 | "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513", 510 | "sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0", 511 | "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782", 512 | "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0", 513 | "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92", 514 | "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f", 515 | "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2", 516 | "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc", 517 | "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1", 518 | 
"sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c", 519 | "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86", 520 | "sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4", 521 | "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c", 522 | "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34", 523 | "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b", 524 | "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d", 525 | "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c", 526 | "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb", 527 | "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7", 528 | "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737", 529 | "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3", 530 | "sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d", 531 | "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358", 532 | "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53", 533 | "sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78", 534 | "sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803", 535 | "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a", 536 | "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f", 537 | "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174", 538 | "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5" 539 | ], 540 | "markers": "python_version >= '3.6'", 541 | "version": "==6.0" 542 | }, 543 | "readme-renderer": { 544 | "hashes": [ 545 | "sha256:262510fe6aae81ed4e94d8b169077f325614c0b1a45916a80442c6576264a9c2", 546 | "sha256:dfb4d17f21706d145f7473e0b61ca245ba58e810cf9b2209a48239677f82e5b0" 547 | ], 548 | "markers": "python_version >= '3.6'", 549 | "version": "==34.0" 550 | }, 551 | "requests": { 552 | "extras": [ 553 | "socks" 554 | ], 555 | "hashes": [ 556 | "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", 557 | "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" 558 | ], 559 | "index": "pypi", 560 | "version": "==2.27.1" 561 | }, 562 | "requests-html": { 563 | "hashes": [ 564 | "sha256:7e929ecfed95fb1d0994bb368295d6d7c4d06b03fcb900c33d7d0b17e6003947", 565 | "sha256:cb8a78cf829c4eca9d6233f28524f65dd2bfaafb4bdbbc407f0a0b8f487df6e2" 566 | ], 567 | "index": "pypi", 568 | "version": "==0.10.0" 569 | }, 570 | "requests-toolbelt": { 571 | "hashes": [ 572 | "sha256:18565aa58116d9951ac39baa288d3adb5b3ff975c4f25eee78555d89e8f247f7", 573 | "sha256:62e09f7ff5ccbda92772a29f394a49c3ad6cb181d568b1337626b2abb628a63d" 574 | ], 575 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", 576 | "version": "==0.10.1" 577 | }, 578 | "rfc3986": { 579 | "hashes": [ 580 | "sha256:270aaf10d87d0d4e095063c65bf3ddbc6ee3d0b226328ce21e036f946e421835", 581 | "sha256:a86d6e1f5b1dc238b218b012df0aa79409667bb209e58da56d0b94704e712a97" 582 | ], 583 | "version": "==1.5.0" 584 | }, 585 | "setuptools": { 586 | "hashes": [ 587 | "sha256:22c7348c6d2976a52632c67f7ab0cdf40147db7789f9aed18734643fe9cf3373", 588 | "sha256:4ce92f1e1f8f01233ee9952c04f6b81d1e02939d6e1b488428154974a4d0783e" 589 | ], 590 | "markers": "python_version >= '3.6'", 591 | "version": "==59.6.0" 592 | }, 593 | "six": { 594 
| "hashes": [ 595 | "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", 596 | "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" 597 | ], 598 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", 599 | "version": "==1.16.0" 600 | }, 601 | "soupsieve": { 602 | "hashes": [ 603 | "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759", 604 | "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d" 605 | ], 606 | "markers": "python_version >= '3.6'", 607 | "version": "==2.3.2.post1" 608 | }, 609 | "tomli": { 610 | "hashes": [ 611 | "sha256:05b6166bff487dc068d322585c7ea4ef78deed501cc124060e0f238e89a9231f", 612 | "sha256:e3069e4be3ead9668e21cb9b074cd948f7b3113fd9c8bba083f48247aab8b11c" 613 | ], 614 | "markers": "python_version >= '3.6'", 615 | "version": "==1.2.3" 616 | }, 617 | "tqdm": { 618 | "hashes": [ 619 | "sha256:5f4f682a004951c1b450bc753c710e9280c5746ce6ffedee253ddbcbf54cf1e4", 620 | "sha256:6fee160d6ffcd1b1c68c65f14c829c22832bc401726335ce92c52d395944a6a1" 621 | ], 622 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", 623 | "version": "==4.64.1" 624 | }, 625 | "twine": { 626 | "hashes": [ 627 | "sha256:8efa52658e0ae770686a13b675569328f1fba9837e5de1867bfe5f46a9aefe19", 628 | "sha256:d0550fca9dc19f3d5e8eadfce0c227294df0a2a951251a4385797c8a6198b7c8" 629 | ], 630 | "index": "pypi", 631 | "version": "==3.8.0" 632 | }, 633 | "typing-extensions": { 634 | "hashes": [ 635 | "sha256:1a9462dcc3347a79b1f1c0271fbe79e844580bb598bafa1ed208b94da3cdcd42", 636 | "sha256:21c85e0fe4b9a155d0799430b0ad741cdce7e359660ccbd8b530613e8df88ce2" 637 | ], 638 | "index": "pypi", 639 | "version": "==4.1.1" 640 | }, 641 | "urllib3": { 642 | "hashes": [ 643 | "sha256:47cc05d99aaa09c9e72ed5809b60e7ba354e64b59c9c173ac3018642d8bb41fc", 644 | "sha256:c083dd0dce68dbfbe1129d5271cb90f9447dea7d52097c6e0126120c521ddea8" 645 | ], 646 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", 647 | "version": "==1.26.13" 648 | }, 649 | "w3lib": { 650 | "hashes": [ 651 | "sha256:13df15f8c17b163de0fd5faa892c1ad143e190dfcbdb98534bb975eb37c6c7d6", 652 | "sha256:c5d966f86ae3fb546854478c769250c3ccb7581515b3221bcd2f864440000188" 653 | ], 654 | "markers": "python_version >= '3.6'", 655 | "version": "==2.0.1" 656 | }, 657 | "wcwidth": { 658 | "hashes": [ 659 | "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784", 660 | "sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83" 661 | ], 662 | "version": "==0.2.5" 663 | }, 664 | "webencodings": { 665 | "hashes": [ 666 | "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", 667 | "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923" 668 | ], 669 | "version": "==0.5.1" 670 | }, 671 | "websockets": { 672 | "hashes": [ 673 | "sha256:0dd4eb8e0bbf365d6f652711ce21b8fd2b596f873d32aabb0fbb53ec604418cc", 674 | "sha256:1d0971cc7251aeff955aa742ec541ee8aaea4bb2ebf0245748fbec62f744a37e", 675 | "sha256:1d6b4fddb12ab9adf87b843cd4316c4bd602db8d5efd2fb83147f0458fe85135", 676 | "sha256:230a3506df6b5f446fed2398e58dcaafdff12d67fe1397dff196411a9e820d02", 677 | "sha256:276d2339ebf0df4f45df453923ebd2270b87900eda5dfd4a6b0cfa15f82111c3", 678 | "sha256:2cf04601633a4ec176b9cc3d3e73789c037641001dbfaf7c411f89cd3e04fcaf", 679 | "sha256:3ddff38894c7857c476feb3538dd847514379d6dc844961dc99f04b0384b1b1b", 680 | 
"sha256:48c222feb3ced18f3dc61168ca18952a22fb88e5eb8902d2bf1b50faefdc34a2", 681 | "sha256:51d04df04ed9d08077d10ccbe21e6805791b78eac49d16d30a1f1fe2e44ba0af", 682 | "sha256:597c28f3aa7a09e8c070a86b03107094ee5cdafcc0d55f2f2eac92faac8dc67d", 683 | "sha256:5c8f0d82ea2468282e08b0cf5307f3ad022290ed50c45d5cb7767957ca782880", 684 | "sha256:7189e51955f9268b2bdd6cc537e0faa06f8fffda7fb386e5922c6391de51b077", 685 | "sha256:7df3596838b2a0c07c6f6d67752c53859a54993d4f062689fdf547cb56d0f84f", 686 | "sha256:826ccf85d4514609219725ba4a7abd569228c2c9f1968e8be05be366f68291ec", 687 | "sha256:836d14eb53b500fd92bd5db2fc5894f7c72b634f9c2a28f546f75967503d8e25", 688 | "sha256:85db8090ba94e22d964498a47fdd933b8875a1add6ebc514c7ac8703eb97bbf0", 689 | "sha256:85e701a6c316b7067f1e8675c638036a796fe5116783a4c932e7eb8e305a3ffe", 690 | "sha256:900589e19200be76dd7cbaa95e9771605b5ce3f62512d039fb3bc5da9014912a", 691 | "sha256:9147868bb0cc01e6846606cd65cbf9c58598f187b96d14dd1ca17338b08793bb", 692 | "sha256:9e7fdc775fe7403dbd8bc883ba59576a6232eac96dacb56512daacf7af5d618d", 693 | "sha256:ab5ee15d3462198c794c49ccd31773d8a2b8c17d622aa184f669d2b98c2f0857", 694 | "sha256:ad893d889bc700a5835e0a95a3e4f2c39e91577ab232a3dc03c262a0f8fc4b5c", 695 | "sha256:b2e71c4670ebe1067fa8632f0d081e47254ee2d3d409de54168b43b0ba9147e0", 696 | "sha256:b43b13e5622c5a53ab12f3272e6f42f1ce37cd5b6684b2676cb365403295cd40", 697 | "sha256:b4ad84b156cf50529b8ac5cc1638c2cf8680490e3fccb6121316c8c02620a2e4", 698 | "sha256:be5fd35e99970518547edc906efab29afd392319f020c3c58b0e1a158e16ed20", 699 | "sha256:caa68c95bc1776d3521f81eeb4d5b9438be92514ec2a79fececda814099c8314", 700 | "sha256:d144b350045c53c8ff09aa1cfa955012dd32f00c7e0862c199edcabb1a8b32da", 701 | "sha256:d2c2d9b24d3c65b5a02cac12cbb4e4194e590314519ed49db2f67ef561c3cf58", 702 | "sha256:e9e5fd6dbdf95d99bc03732ded1fc8ef22ebbc05999ac7e0c7bf57fe6e4e5ae2", 703 | "sha256:ebf459a1c069f9866d8569439c06193c586e72c9330db1390af7c6a0a32c4afd", 704 | "sha256:f31722f1c033c198aa4a39a01905951c00bd1c74f922e8afc1b1c62adbcdd56a", 705 | "sha256:f68c352a68e5fdf1e97288d5cec9296664c590c25932a8476224124aaf90dbcd" 706 | ], 707 | "markers": "python_full_version >= '3.6.1'", 708 | "version": "==9.1" 709 | }, 710 | "werkzeug": { 711 | "hashes": [ 712 | "sha256:1421ebfc7648a39a5c58c601b154165d05cf47a3cd0ccb70857cbdacf6c8f2b8", 713 | "sha256:b863f8ff057c522164b6067c9e28b041161b4be5ba4d0daceeaa50a163822d3c" 714 | ], 715 | "markers": "python_version >= '3.6'", 716 | "version": "==2.0.3" 717 | }, 718 | "zipp": { 719 | "hashes": [ 720 | "sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832", 721 | "sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc" 722 | ], 723 | "markers": "python_version >= '3.6'", 724 | "version": "==3.6.0" 725 | } 726 | }, 727 | "develop": {} 728 | } 729 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Weibo Scraper 2 | 3 | [![PyPI](https://img.shields.io/pypi/v/weibo-scraper.svg)](https://pypi.org/project/weibo-scraper/) 4 | [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/weibo-scraper.svg)](https://docs.python.org/3/whatsnew/3.6.html) 5 | [![Build Status](https://travis-ci.org/Xarrow/weibo-scraper.svg?branch=master)](https://travis-ci.org/Xarrow/weibo-scraper) 6 | [![codecov](https://codecov.io/gh/Xarrow/weibo-scraper/branch/master/graph/badge.svg)](https://codecov.io/gh/Xarrow/weibo-scraper) 7 | 8 | ---- 9 | 10 | Simple weibo tweet scraper 
. Crawl weibo tweets without authorization. 11 | The official API has many limitations. 12 | In general, we can instead inspect the mobile site, which has its own API, with Chrome. 13 | 14 | ---- 15 | 16 | # Why 17 | 18 | 1. Crawl weibo data for big-data research. 19 | 20 | 2. Back up data against weibo's shameful blockade. 21 | 22 | 23 | ---- 24 | # Installation 25 | 26 | 27 | ### pip 28 | 29 | ```shell 30 | 31 | $ pip install weibo-scraper 32 | 33 | ``` 34 | 35 | Or upgrade it: 36 | 37 | 38 | ```shell 39 | 40 | $ pip install --upgrade weibo-scraper 41 | 42 | ``` 43 | 44 | ### pipenv 45 | 46 | ```shell 47 | 48 | $ pipenv install weibo-scraper 49 | 50 | ``` 51 | Or upgrade it: 52 | 53 | ```shell 54 | $ pipenv update --outdated # show packages which are outdated 55 | 56 | $ pipenv update weibo-scraper # just update weibo-scraper 57 | 58 | ``` 59 | 60 | 61 | Only Python 3.6+ is supported. 62 | 63 | ---- 64 | # Usage 65 | 66 | ### CLI 67 | 68 | ```bash 69 | 70 | $ weibo-scraper -h 71 | 72 | usage: weibo-scraper [-h] [-u U] [-p P] [-o O] [-f FORMAT] 73 | [-efn EXPORTED_FILE_NAME] [-s] [-d] [--more] [-v] 74 | 75 | weibo-scraper-1.0.7-beta 🚀 76 | 77 | optional arguments: 78 | -h, --help show this help message and exit 79 | -u U username [nickname] which want to exported 80 | -p P pages which exported [ default 1 page ] 81 | -o O output file path which expected [ default 'current 82 | dir' ] 83 | -f FORMAT, --format FORMAT 84 | format which expected [ default 'txt' ] 85 | -efn EXPORTED_FILE_NAME, --exported_file_name EXPORTED_FILE_NAME 86 | file name which expected 87 | -s, --simplify simplify available info 88 | -d, --debug open debug mode 89 | --more more 90 | -v, --version weibo scraper version 91 | 92 | ``` 93 | 94 | ### API 95 | 96 | 1. First, you can get a weibo profile by `name` or `uid`. 97 | 98 | ```python 99 | >>> from weibo_scraper import get_weibo_profile 100 | >>> weibo_profile = get_weibo_profile(name='来去之间',) 101 | >>> .... 102 | ``` 103 | You will get a weibo profile response of type `weibo_base.UserMeta`, which includes the fields below: 104 | 105 | field|chinese|type|sample|ext 106 | ---|---|---|---|--- 107 | id|用户id|str|| 108 | screen_name|微博昵称|Option[str]|| 109 | avatar_hd|高清头像|Option[str]|'https://ww2.sinaimg.cn/orj480/4242e8adjw8elz58g3kyvj20c80c8myg.jpg'| 110 | cover_image_phone|手机版封面|Option[str]|'https://tva1.sinaimg.cn/crop.0.0.640.640.640/549d0121tw1egm1kjly3jj20hs0hsq4f.jpg'| 111 | description|描述|Option[str]|| 112 | follow_count|关注数|Option[int]|3568| 113 | follower_count|被关注数|Option[int]|794803| 114 | gender|性别|Option[str]|'m'/'f'| 115 | raw_user_response|原始返回|Option[dict]|| 116 | 117 |
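The fields in this table are exposed on the returned `weibo_base.UserMeta` object. A minimal follow-up sketch (attribute access is assumed from the field list above; printed values are illustrative only):

```python
>>> from weibo_scraper import get_weibo_profile
>>> weibo_profile = get_weibo_profile(name='来去之间')
>>> weibo_profile.screen_name        # nickname, e.g. '来去之间'
>>> weibo_profile.follower_count     # e.g. 794803
>>> weibo_profile.avatar_hd          # HD avatar URL
>>> weibo_profile.raw_user_response  # the raw dict, for anything not listed above
```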
118 | 2. Second, fetching weibo tweets via a `tweet_container_id` is a less common approach, but it also works well. 119 | 120 | ```python 121 | >>> from weibo_scraper import get_weibo_tweets 122 | >>> for tweet in get_weibo_tweets(tweet_container_id='1076033637346297',pages=1): 123 | >>> print(tweet) 124 | >>> .... 125 | 126 | ``` 127 | 128 | 3. Of course, you can also get raw weibo tweets by an existing nickname, and the `pages` param is optional. 129 | 130 | ```python 131 | >>> from weibo_scraper import get_weibo_tweets_by_name 132 | >>> for tweet in get_weibo_tweets_by_name(name='嘻红豆', pages=1): 133 | >>> print(tweet) 134 | >>> .... 135 | ``` 136 | 137 | 4. If you want to get all tweets, you can set the `pages` param to `None`. 138 | 139 | ```python 140 | >>> from weibo_scraper import get_weibo_tweets_by_name 141 | >>> for tweet in get_weibo_tweets_by_name(name='嘻红豆', pages=None): 142 | >>> print(tweet) 143 | >>> .... 144 | ``` 145 | 146 | 5. You can also get formatted tweets (including comments) via the `weibo_scraper.get_formatted_weibo_tweets_by_name` API: 147 | 148 | ```python 149 | >>> from weibo_scraper import get_formatted_weibo_tweets_by_name 150 | >>> result_iterator = get_formatted_weibo_tweets_by_name(name='嘻红豆', pages=None) 151 | >>> for user_meta in result_iterator: 152 | >>> if user_meta is not None: 153 | >>> for tweetMeta in user_meta.cards_node: 154 | >>> print(tweetMeta.mblog.text) 155 | >>> .... 156 | ``` 157 | 158 | ![img](asserts/weibo_tweets.png) 159 | 160 | 6. Get realtime hot words: 161 | 162 | ```python 163 | hotwords = weibo_scraper.get_realtime_hotwords() 164 | for hw in hotwords: 165 | print(str(hw)) 166 | ``` 167 | 168 | 7. Get realtime hot words repeatedly at a fixed interval: 169 | 170 | ```python 171 | wt = Timer(name="realtime_hotword_timer", fn=weibo_scraper.get_realtime_hotwords, interval=1) 172 | wt.set_ignore_ex(True) 173 | wt.scheduler() 174 | ``` 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | # LICENSE 194 | 195 | MIT 196 | 197 | This project is powered by the JetBrains Open Source License. 198 | 199 | ![img](asserts/jetbrains.svg) -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-slate -------------------------------------------------------------------------------- /asserts/jetbrains.svg: -------------------------------------------------------------------------------- (SVG markup not captured in this text dump) -------------------------------------------------------------------------------- /asserts/weibo_tweets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xarrow/weibo-scraper/0a8cce7edfd797b106236186d64922732a5007db/asserts/weibo_tweets.png -------------------------------------------------------------------------------- /packing_and_upload.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | echo ">>> deleting existing build artifacts" 3 | rm -rf build 4 | rm -rf dist 5 | rm -rf weibo_scraper.egg-info 6 | echo ">>> running python setup.py sdist" 7 | python setup.py sdist 8 | echo ">>> running python setup.py bdist_wheel --universal" 9 | python setup.py bdist_wheel --universal 10 | 11 | echo ">>> running twine check" 12 | twine check dist/* 13 | echo ">>> running twine upload dist/* " 14 | twine upload dist/* --skip-existing -------------------------------------------------------------------------------- /persistence/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Version: 1.0 5 | Author: Helixcs 6 | Site: https://github.com/Xarrow/weibo-scraper 7 | File: __init__.py 8 | Time: 6/9/18 9 | """ 10 | 11 | from .persistence import dispatch -------------------------------------------------------------------------------- /persistence/persistence.py:
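Before the module source: `persistence/__init__.py` above re-exports `dispatch`, which is defined at the bottom of this file and selects an exporter implementation by `persistence_format`. A minimal usage sketch (argument names taken from the `dispatch` signature below; the account name and page count are only examples):

```python
from persistence import dispatch

# Export up to 2 pages of a user's tweets as simplified text lines.
# With the defaults, the file is written to the current working directory,
# named like "嘻红豆_export_<timestamp>.txt".
dispatch(name="嘻红豆", pages=2, is_simplify=True, persistence_format="txt")

# persistence_format="json" or "pickle" switches to the corresponding exporter.
```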
-------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Verion: 1.0 5 | Author: Helixcs 6 | Site: https://github.com/Xarrow/weibo-scraper 7 | File: persistence.py 8 | Time: 6/9/18 9 | Reference : https://www.toptal.com/python/python-design-patterns 10 | """ 11 | import logging 12 | from contextlib import contextmanager 13 | import time 14 | import os 15 | import pickle 16 | 17 | from weibo_scraper import get_formatted_weibo_tweets_by_name 18 | from weibo_base import rt_logger,logger,is_debug 19 | 20 | 21 | DEFAULT_EXPORT_FILENAME = "export_%s" % int(time.time()) 22 | DEFAULT_EXPORT_PATH = os.getcwd() 23 | DEFAULT_DOT = "." 24 | 25 | 26 | class WeiboScraperPersistenceException(Exception): 27 | def __init__(self, message): 28 | self.message = message 29 | 30 | 31 | @contextmanager 32 | def open_file(file_name: str): 33 | file = open(file=file_name, mode='wb') 34 | yield file 35 | file.flush() 36 | file.close() 37 | 38 | 39 | class BaseAction(object): 40 | def __init__(self, 41 | name: str, 42 | pages: int = None, 43 | export_file_path: str = None, 44 | export_file_name: str = None, 45 | export_file_suffix: str = None, 46 | is_simplify: bool = None): 47 | """ 48 | BaseAction 49 | :param name: weibo name which wants to search and persistence 50 | :param pages: max pages which requests 51 | :param export_file_path: export file path 52 | :param export_file_name: export file name 53 | :param export_file_suffix: export file suffix , examples : txt , sql , html 54 | :param is_simplify: whether export pure weibo tweets 55 | """ 56 | if name is None or name == '': 57 | raise WeiboScraperPersistenceException("persistence need param of 'name' which you want to search !") 58 | 59 | self.name = name 60 | self.pages = pages 61 | self.export_file_path = export_file_path or DEFAULT_EXPORT_PATH 62 | self.export_file_name = export_file_name 63 | self.export_file_suffix = export_file_suffix 64 | self.export_file_suffix = DEFAULT_DOT + self.export_file_suffix if not self.export_file_suffix.startswith( 65 | DEFAULT_DOT) else self.export_file_suffix 66 | 67 | if self.export_file_path is not None: 68 | if not os.path.isdir(self.export_file_path): 69 | raise WeiboScraperPersistenceException("export file path is not a dir !") 70 | # reset export_file_name 71 | # sample as "嘻红豆_export_1534784328.json" or custom file name "嘻红豆.txt" 72 | self.export_file_name = self.export_file_name if self.export_file_name is not None else self.name + "_" + DEFAULT_EXPORT_FILENAME 73 | self.export_file_name = self.export_file_name + self.export_file_suffix if not self.export_file_name.__contains__( 74 | DEFAULT_DOT) else self.export_file_name 75 | self.is_simplfy = True if is_simplify is None else is_simplify 76 | 77 | def fetch_data(self, *args, **kwargs): 78 | pass 79 | 80 | def execute(self, *args, **kwargs): 81 | # 父类执行 82 | pass 83 | 84 | 85 | class WeiboTweetsAction(BaseAction): 86 | """ weibo tweets action""" 87 | 88 | def fetch_data(self, *args, **kwargs): 89 | tweets_iterator = get_formatted_weibo_tweets_by_name(name=self.name, pages=self.pages) 90 | for tweets_parser in tweets_iterator: 91 | for tweet_meta in tweets_parser.cards_node: 92 | yield tweet_meta 93 | 94 | 95 | class WeiboFollowerAndFansAction(BaseAction): 96 | """ weibo followers and fans action""" 97 | 98 | def fetch_data(self, *args, **kwargs): 99 | pass 100 | 101 | 102 | class TweetsPersistence(object): 103 | def __init__(self, action: BaseAction): 104 | self.action = action 105 | 106 | 
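    # execute_with_de() is the action's execute() wrapped by the @rt_logger decorator
    # imported from weibo_base; persistence() below routes through it only when the
    # module-level is_debug flag (also from weibo_base) is truthy, otherwise it calls
    # the action's execute() directly.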
@rt_logger 107 | def execute_with_de(self, *args, **kwargs): 108 | self.action.execute(*args, **kwargs) 109 | 110 | def persistence(self, *args, **kwargs): 111 | # TODO function to AOP 112 | if is_debug: 113 | self.execute_with_de(*args, **kwargs) 114 | else: 115 | self.action.execute(*args, **kwargs) 116 | 117 | 118 | # -------------------------- implement ------------------------ 119 | 120 | class HTMLPersistenceImpl(WeiboTweetsAction): 121 | """ export as html file """ 122 | 123 | def __init__(self, 124 | name: str = None, 125 | pages: int = None, 126 | export_file_path=None, 127 | export_file_name=None, 128 | export_file_suffix: str = "html", 129 | is_simplify: bool = False) -> None: 130 | super().__init__(name=name, 131 | pages=pages, 132 | export_file_path=export_file_path, 133 | export_file_name=export_file_name, 134 | export_file_suffix=export_file_suffix, 135 | is_simplify=is_simplify) 136 | 137 | def execute(self, *args, **kwargs): 138 | # do nothing 139 | pass 140 | 141 | 142 | class SerializablePersistenceImpl(WeiboTweetsAction): 143 | def __init__(self, 144 | name: str = None, 145 | pages: int = None, 146 | export_file_path=None, 147 | export_file_name=None, 148 | export_file_suffix: str = "pickle", 149 | is_simplify: bool = False) -> None: 150 | super().__init__(name=name, 151 | pages=pages, 152 | export_file_path=export_file_path, 153 | export_file_name=export_file_name, 154 | export_file_suffix=export_file_suffix, 155 | is_simplify=is_simplify) 156 | 157 | def execute(self, *args, **kwargs): 158 | with open_file(file_name=os.path.join(self.export_file_path, self.export_file_name)) as pickle_file: 159 | for tweet_meta in self.fetch_data(): 160 | if self.is_simplfy: 161 | single_line = "id: " + tweet_meta.mblog.id + "\t\t" + \ 162 | "source: " + tweet_meta.mblog.source + "\t\t" + \ 163 | "text: " + tweet_meta.mblog.text + "\t\t" 164 | if tweet_meta.mblog.pics_node and len(tweet_meta.mblog.pics_node) > 0: 165 | single_line += "pics: " 166 | for pic in tweet_meta.mblog.pics_node: 167 | single_line = single_line + pic.large_url + "\t\t" 168 | else: 169 | single_line = str(tweet_meta.raw_card) 170 | single_line += "\t\t\n" 171 | pickle.dump(single_line, pickle_file) 172 | pass 173 | 174 | 175 | class TxtPersistenceImpl(WeiboTweetsAction): 176 | """ export as txt file """ 177 | 178 | def __init__(self, 179 | name: str = None, 180 | pages: int = None, 181 | export_file_path=None, 182 | export_file_name=None, 183 | export_file_suffix: str = "txt", 184 | is_simplify: bool = False) -> None: 185 | super().__init__(name=name, 186 | pages=pages, 187 | export_file_path=export_file_path, 188 | export_file_name=export_file_name, 189 | export_file_suffix=export_file_suffix, 190 | is_simplify=is_simplify) 191 | 192 | def execute(self): 193 | with open_file(file_name=os.path.join(self.export_file_path, self.export_file_name)) as text_file: 194 | for tweet_meta in self.fetch_data(): 195 | if self.is_simplfy: 196 | single_line = "id: " + tweet_meta.mblog.id + "\t\t" + \ 197 | "source: " + tweet_meta.mblog.source + "\t\t" + \ 198 | "text: " + tweet_meta.mblog.text + "\t\t" 199 | if tweet_meta.mblog.pics_node and len(tweet_meta.mblog.pics_node) > 0: 200 | single_line += "pics: " 201 | for pic in tweet_meta.mblog.pics_node: 202 | single_line = single_line + pic.large_url + "\t\t" 203 | else: 204 | # FIXME upgrade weibo_base.py 205 | single_line = str(tweet_meta.raw_card_node) 206 | 207 | single_line += "\t\t\n" 208 | text_file.write(bytes(single_line, encoding='utf-8')) 209 | 210 | 211 | class 
CSVPersistenceImpl(BaseAction): 212 | """export as csv file""" 213 | pass 214 | 215 | 216 | class SQLPersistenceImpl(BaseAction): 217 | """ export as sql file """ 218 | pass 219 | 220 | 221 | class JSONPersistenceImpl(WeiboTweetsAction): 222 | """ export as json file""" 223 | 224 | def __init__(self, 225 | name: str = None, 226 | pages: int = None, 227 | export_file_path=None, 228 | export_file_name=None, 229 | export_file_suffix: str = "json", 230 | is_simplify: bool = False) -> None: 231 | super().__init__(name=name, 232 | pages=pages, 233 | export_file_path=export_file_path, 234 | export_file_name=export_file_name, 235 | export_file_suffix=export_file_suffix, 236 | is_simplify=is_simplify) 237 | 238 | def execute(self): 239 | with open_file(file_name=os.path.join(self.export_file_path, self.export_file_name)) as json_file: 240 | for tweet_meta in self.fetch_data(): 241 | if self.is_simplfy: 242 | single_line = "id: " + tweet_meta.mblog.id + "\t\t" + \ 243 | "source: " + tweet_meta.mblog.source + "\t\t" + \ 244 | "text: " + tweet_meta.mblog.text + "\t\t" 245 | if tweet_meta.mblog.pics_node and len(tweet_meta.mblog.pics_node) > 0: 246 | single_line += "pics: " 247 | for pic in tweet_meta.mblog.pics_node: 248 | single_line = single_line + pic.large_url + "\t\t" 249 | else: 250 | single_line = str(tweet_meta.raw_card) 251 | json_file.write(bytes(single_line, encoding='utf-8')) 252 | json_file.write(bytes('\t\t\n', encoding='utf-8')) 253 | 254 | 255 | def dispatch(name: str, pages: int = None, is_simplify: bool = True, persistence_format: str = "txt", 256 | export_file_path: str = None, export_file_name: str = None, is_debug: bool = False): 257 | # if not is_debug: 258 | # logger.getLogger().setLevel(logging.DEBUG) 259 | if persistence_format == 'txt': 260 | pst = TxtPersistenceImpl(name=name, pages=pages, is_simplify=is_simplify, export_file_path=export_file_path, 261 | export_file_name=export_file_name) 262 | elif persistence_format == 'sql': 263 | pst = SQLPersistenceImpl(name=name, pages=pages, is_simplify=is_simplify, export_file_path=export_file_path, 264 | export_file_name=export_file_name) 265 | elif persistence_format == 'html': 266 | pst = HTMLPersistenceImpl(name=name, pages=pages, is_simplify=is_simplify, export_file_path=export_file_path, 267 | export_file_name=export_file_name) 268 | elif persistence_format == 'csv': 269 | pst = CSVPersistenceImpl(name=name, pages=pages, is_simplify=is_simplify, export_file_path=export_file_path, 270 | export_file_name=export_file_name) 271 | elif persistence_format == 'pickle': 272 | pst = SerializablePersistenceImpl(name=name, pages=pages, is_simplify=is_simplify, 273 | export_file_path=export_file_path, export_file_name=export_file_name) 274 | elif persistence_format == 'json': 275 | pst = JSONPersistenceImpl(name=name, pages=pages, is_simplify=is_simplify, export_file_path=export_file_path, 276 | export_file_name=export_file_name) 277 | else: 278 | raise WeiboScraperPersistenceException("Unknown persistence format in [txt, sql ,html, csv, pickle]") 279 | tpst = TweetsPersistence(action=pst) 280 | tpst.persistence() 281 | -------------------------------------------------------------------------------- /samples/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Verion: 1.0 5 | Author: Helixcs 6 | Site: https://github.com/Xarrow/weibo-scraper 7 | File: __init__.py.py 8 | Time: 5/27/18 9 | """ 10 | 
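The `dispatch` helper defined in `persistence.py` above is the single entry point for exporting tweets: it picks the matching `*PersistenceImpl` by `persistence_format` and hands it to `TweetsPersistence`. A minimal usage sketch, mirroring the call made in `tests/test_weibo_scraper.py`; the nickname and output file name are example values:

```python
from persistence import persistence

# Export one page of tweets as plain text into the current working directory.
# persistence_format may be txt, sql, html, csv, pickle or json; only the formats
# with a concrete execute() (txt, pickle, json) actually write data.
persistence.dispatch(
    name="嘻红豆",                    # weibo nickname to scrape
    pages=1,                          # number of tweet pages to fetch
    is_simplify=True,                 # keep only id / source / text / pic urls
    persistence_format="txt",
    export_file_name="demo_export",   # becomes demo_export.txt
)
```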
-------------------------------------------------------------------------------- /samples/weibo_flasgger/FLASGGER_README.md: -------------------------------------------------------------------------------- 1 | ## weibo scraper flasgger api 2 | ---- 3 | The sample of flasgger provide an elegant api document for weibo scraper. 4 | 5 | ---- 6 | ### 1. Install 7 | 8 | **Install** weibo-scraper by pipenv 9 | ```bash 10 | $ pipenv install weibo-scraper 11 | 12 | ``` 13 | 14 | ### 2. Run 15 | **Run** flasgger_api.py 16 | 17 | ```bash 18 | $ python samples/weibo_flasgger/flasgger_api.py 19 | ``` 20 | 21 | ### 3. Visit 22 | **Vist** `http://127.0.0.1:5000` in browser , and you will find api document as below. 23 | 24 | ![img](https://raw.githubusercontent.com/Xarrow/weibo-scraper/search_name/samples/weibo_flasgger/imgs/weibo-flasgger.png) -------------------------------------------------------------------------------- /samples/weibo_flasgger/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Author: Helixcs 5 | Site: https://github.com/Xarrow/weibo-scraper 6 | File: __init__.py.py 7 | Time: 5/24/18 8 | """ 9 | 10 | from .flasgger_api import * -------------------------------------------------------------------------------- /samples/weibo_flasgger/flasgger_api.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Author: Helixcs 5 | Site: https://github.com/Xarrow/weibo-scraper 6 | File: flasgger_api.py 7 | Time: 5/24/18 8 | """ 9 | import logging 10 | from flask import Flask, jsonify, request, make_response, render_template 11 | from flasgger import Swagger, swag_from 12 | 13 | from weibo_base.weibo_api import * 14 | 15 | level = logging.DEBUG 16 | format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' 17 | datefmt = '%Y-%m-%d %H:%M' 18 | logging.basicConfig(level=level, format=format, datefmt=datefmt) 19 | logger = logging.getLogger(__name__) 20 | 21 | app = Flask(__name__) 22 | 23 | DEFAULT_CONFIG = { 24 | "headers": [], 25 | "specs": [{ 26 | "endpoint": 'apispec_1', 27 | "route": '/apispec_1.json', 28 | "rule_filter": lambda rule: True, # all in 29 | "model_filter": lambda tag: True, # all in 30 | } 31 | ], 32 | "static_url_path": "/flasgger_static", 33 | # "static_folder": "static", # must be set by user 34 | "swagger_ui": True, 35 | "specs_route": "/" 36 | } 37 | 38 | # see :https://github.com/OAI/OpenAPI-Specification/blob/master/versions/2.0.md#contactObject 39 | TEMPLATE = { 40 | "swagger": "2.0", 41 | "info": { 42 | "title": "Weibo Scraper API", 43 | "description": "weibo scraper 接口列表", 44 | "host": "127.0.0.1:5002", 45 | "basePath": "/", 46 | "schemes": [ 47 | "http", "https" 48 | ], 49 | "consumes": ['application/json'], 50 | "tags": ["zhangjian", ], 51 | "contact": { 52 | "name": "Helixcs", 53 | "email": "zhangjian12424@gmail.com", 54 | "url": "https://xarrow.github.io/weibo-scraper", 55 | }, 56 | "version": "1.0.4" 57 | }, 58 | } 59 | 60 | THIS_PAGE_CONFIG = Swagger.DEFAULT_CONFIG 61 | THIS_PAGE_CONFIG.update({"specs_route": "/"}) 62 | swagger = Swagger(app=app, config=THIS_PAGE_CONFIG, template=TEMPLATE) 63 | 64 | 65 | @app.route("/api/weiboBase/search_by_name/", methods=["GET"]) 66 | @swag_from("ymls/search_by_name.yml") 67 | def search_by_name_api(name): 68 | return jsonify(search_by_name(name=name)) 69 | 70 | 71 | @app.route("/api/weiboBase/weibo_getIndex/", methods=['GET']) 72 | @swag_from('ymls/weibo_getIndex.yml') 
73 | def weibo_getIndex_api(uid_value): 74 | return jsonify(weibo_getIndex(uid_value=uid_value)) 75 | 76 | 77 | @app.route("/api/weiboBase/weibo_tweets//", methods=["GET"]) 78 | @swag_from('ymls/weibo_tweets.yml') 79 | def weibo_tweets_api(containerid, page): 80 | return jsonify(weibo_tweets(containerid=containerid, page=page)) 81 | 82 | 83 | # weibo component api 84 | 85 | @app.route('/api/weiboComponet/exist_get_uid/') 86 | @swag_from('ymls/exist_get_uid.yml') 87 | def exist_get_uid_api(name): 88 | return jsonify(exist_get_uid(name=name)) 89 | 90 | 91 | @app.route('/api.weiboComponent/get_weibo_containerid/', methods=["GET"]) 92 | @swag_from("ymls/get_weibo_containerid.yml") 93 | def get_weibo_containerid_api(uid): 94 | return jsonify(get_weibo_containerid(uid=uid)) 95 | 96 | 97 | app.run(port=5001, debug=True) 98 | -------------------------------------------------------------------------------- /samples/weibo_flasgger/imgs/weibo-flasgger.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xarrow/weibo-scraper/0a8cce7edfd797b106236186d64922732a5007db/samples/weibo_flasgger/imgs/weibo-flasgger.png -------------------------------------------------------------------------------- /samples/weibo_flasgger/ymls/exist_get_uid.yml: -------------------------------------------------------------------------------- 1 | 微博组件API —— 查询用户uid 2 | 查询用户uid 3 | --- 4 | tags: 5 | - 微博组件 API 接口 6 | parameters: 7 | - name: name 8 | in: path 9 | type: string 10 | default: 嘻红豆 11 | require: true 12 | description: 微博用户名 13 | definitions: 14 | UidResult: 15 | type: object 16 | properties: 17 | exist: 18 | type: boolean 19 | name: 20 | type: string 21 | uid: 22 | type: integer 23 | reponse: 24 | 200: 25 | description: 成功返回 26 | schema: 27 | $ref: '#/definitions/UidResult' 28 | example: {'exist': True, 'name': '嘻红豆', 'uid': 3637346297} -------------------------------------------------------------------------------- /samples/weibo_flasgger/ymls/get_weibo_containerid.yml: -------------------------------------------------------------------------------- 1 | 微博组件API —— 查询用户微博内容 containerid 2 | 查询用户微博 containerid 3 | --- 4 | tags: 5 | - 微博组件 API 接口 6 | parameters: 7 | - name: uid 8 | in: path 9 | type: string 10 | default: 1843242321 11 | require: true 12 | description: uid 13 | reponse: 14 | 200: 15 | description: 成功返回 -------------------------------------------------------------------------------- /samples/weibo_flasgger/ymls/search_by_name.yml: -------------------------------------------------------------------------------- 1 | 微博基础API - 用户名查询 2 | 微博用户名查询 3 | --- 4 | tags: 5 | - 微博基础API接口 6 | parameters: 7 | - name: name 8 | in: path 9 | type: string 10 | default: Helixcs 11 | required: true 12 | description: 微博用户名 13 | response: 14 | 200: 15 | description: 成功返回 16 | swagger: 2.0 -------------------------------------------------------------------------------- /samples/weibo_flasgger/ymls/weibo_getIndex.yml: -------------------------------------------------------------------------------- 1 | 微博基础API - 用户个人信息查询 2 | 用户个人信息查询 3 | --- 4 | tags: 5 | - 微博基础API接口 6 | parameters: 7 | - name: uid_value 8 | in: path 9 | type: string 10 | default: 1843242321 11 | required: true 12 | description: 微博用户id 13 | response: 14 | 200: 15 | description: 成功返回 16 | swagger: 2.0 -------------------------------------------------------------------------------- /samples/weibo_flasgger/ymls/weibo_tweets.yml: 
-------------------------------------------------------------------------------- 1 | 微博基础API - 微博查询 2 | 微博查询 3 | --- 4 | tags: 5 | - 微博基础API接口 6 | parameters: 7 | - name: containerid 8 | in: path 9 | type: string 10 | default: 1076031843242321 11 | required: true 12 | description: containerid 13 | - name: page 14 | in: path 15 | type: string 16 | default: 1 17 | require: true 18 | description: page 19 | response: 20 | 200: 21 | description: 成功返回 22 | swagger: 2.0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import sys 4 | from shutil import rmtree 5 | 6 | from setuptools import find_packages, setup, Command 7 | 8 | here = os.path.abspath(os.path.dirname(__file__)) 9 | 10 | with io.open(os.path.join(here, 'README.md'), encoding='UTF-8') as f: 11 | long_description = '\n' + f.read() 12 | 13 | 14 | class UploadCommand(Command): 15 | """Support setup.py upload.""" 16 | 17 | description = 'Build and publish the package.' 18 | user_options = [] 19 | 20 | @staticmethod 21 | def status(s): 22 | """Prints things in bold.""" 23 | print('\033[1m{0}\033[0m'.format(s)) 24 | 25 | def initialize_options(self): 26 | pass 27 | 28 | def finalize_options(self): 29 | pass 30 | 31 | def run(self): 32 | try: 33 | self.status('Removing previous builds…') 34 | rmtree(os.path.join(here, 'dist')) 35 | except OSError: 36 | pass 37 | 38 | self.status('Building Source and Wheel (universal) distribution…') 39 | os.system('{0} setup.py sdist bdist_wheel --universal'.format(sys.executable)) 40 | 41 | self.status('Uploading the package to PyPi via Twine…') 42 | os.system('twine upload dist/*') 43 | 44 | sys.exit() 45 | 46 | 47 | setup( 48 | version="1.0.7rc1.dev3", 49 | long_description="", 50 | long_description_content_type="text/markdown", 51 | name="weibo-scraper", 52 | url="https://github.com/Xarrow/weibo-scraper", 53 | author="helixcs", 54 | author_email="zhangjian12424@gmail.com", 55 | license="MIT", 56 | classifiers=[ 57 | # Trove classifiers 58 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 59 | 'License :: OSI Approved :: MIT License', 60 | 'Programming Language :: Python', 61 | 'Programming Language :: Python :: 3.6', 62 | 'Programming Language :: Python :: Implementation :: CPython', 63 | 'Programming Language :: Python :: Implementation :: PyPy' 64 | ], 65 | install_requires=['requests'], 66 | keywords="weibo scraper crawl", 67 | # If your package is a single module, use this instead of 'packages': 68 | py_modules=['weibo_scraper', 'weibo_scraper_cli'], 69 | # If your package has custom module , 70 | # Full list :https://docs.python.org/3.6/distutils/setupscript.html 71 | packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), 72 | python_requires='>=3.6', 73 | # $ setup.py publish support. 
74 | cmdclass={ 75 | 'upload': UploadCommand, 76 | }, 77 | # packing to command tool interface 78 | entry_points={ 79 | 'console_scripts': ['weibo-scraper=weibo_scraper_cli:cli'] 80 | } 81 | 82 | ) 83 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding=utf-8 -*- 2 | """ 3 | @Author xuanji.zj 4 | @Email xuanji.zj@alibaba-inc.com 5 | @Time 2021/6/6 7:38 下午 6 | @desc Add New Functions In __init__.py 7 | 8 | """ 9 | 10 | import sys 11 | import os 12 | import weibo_scraper 13 | from weibo_scraper import set_debug 14 | from weibo_base.weibo_component import exist_get_uid, get_tweet_containerid 15 | from weibo_base.weibo_util import Timer 16 | import logging 17 | 18 | if __name__ == '__main__': 19 | set_debug() 20 | uid = exist_get_uid(name='嘻红豆') 21 | print(uid) 22 | containerid = get_tweet_containerid(uid=uid.get('uid')) 23 | print(containerid) 24 | 25 | result = weibo_scraper.get_weibo_tweets_by_name(name="嘻红豆", pages=1) 26 | for tweet in result: 27 | print(tweet) 28 | result = weibo_scraper.get_weibo_tweets(tweet_container_id=containerid, pages=1) 29 | for tweet in result: 30 | print(tweet) 31 | 32 | wp = weibo_scraper.get_weibo_profile(name='嘻红豆') 33 | print(wp.raw_user_response) 34 | 35 | hotwords = weibo_scraper.get_realtime_hotwords() 36 | for hw in hotwords: 37 | print(str(hw)) 38 | pass 39 | wt = Timer(name="realtime_hotword_timer", fn=weibo_scraper.get_realtime_hotwords, interval=1) 40 | wt.set_ignore_ex(True) 41 | wt.scheduler() 42 | -------------------------------------------------------------------------------- /tests/test_weibo_scraper.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Author: Helixcs 5 | Site: https://github.com/Xarrow/weibo-scraper 6 | File: test_weibo_scraper.py 7 | Time: 5/11/18 8 | """ 9 | import unittest 10 | 11 | import weibo_scraper 12 | from weibo_base import * 13 | 14 | 15 | class TestWeiboScraper(unittest.TestCase): 16 | def test_get_weibo_tweets(self): 17 | result = weibo_scraper.get_weibo_tweets(tweet_container_id='1076031843242321', pages=1) 18 | for tweet in result: 19 | print(tweet) 20 | self.assertIsNotNone(result) 21 | 22 | def test_weibo_base_search_name(self): 23 | result = weibo_api.search_by_name("Helixcs") 24 | self.assertIsNotNone(result) 25 | 26 | # def test_weibo_getIndex(self): 27 | # """Helixcs need login to get cookies""" 28 | # result = weibo_api.weibo_getIndex(uid_value='1843242321') 29 | # self.assertIsNotNone(result) 30 | 31 | def test_is_name_exist(self): 32 | result = exist_get_uid(name="Helixcs") 33 | print(result) # 1843242321 34 | self.assertIsNotNone(result) 35 | 36 | love_result1 = exist_get_uid(name="嘻红豆") 37 | print(love_result1) # 3637346297 38 | self.assertIsNotNone(love_result1) 39 | 40 | love_result2 = exist_get_uid(search_by_name_response='', name='嘻红豆') 41 | print(love_result2) 42 | self.assertIsNotNone(love_result2) 43 | 44 | test_result = exist_get_uid(name='暴走大事件') 45 | print(test_result) # None 46 | self.assertIsNotNone(test_result) 47 | 48 | def test_get_weibo_containerid(self): 49 | # Helixcs need login to get cookies 50 | # common weibo id , uid is from Helixcs 51 | # test_result = get_tweet_containerid(uid="1843242321") 52 | # print('Containerid from Helixcs is : ', test_result) # 1076031843242321 53 | # self.assertIsNotNone(test_result) 54 | 55 | # second profile for weibo api , uid is 
from 来去之间 56 | test_result2 = get_tweet_containerid(uid='1111681197') 57 | print('Containerid from 来去之间 is : ', test_result2) # 2304131111681197_-_ 58 | self.assertIsNotNone(test_result2) 59 | 60 | # second profile for weibo api , uid is from 嘻红豆 61 | test_result3 = get_tweet_containerid(uid='3637346297') 62 | print('Containerid from 嘻红豆 is:', test_result3) 63 | self.assertIsNotNone(test_result3) 64 | 65 | def test_weibo_tweets(self): 66 | result = weibo_tweets(containerid='1076033637346297', page=1) 67 | print(result) 68 | 69 | def test_get_weibo_tweets_by_name(self): 70 | result_iterator = weibo_scraper.get_weibo_tweets_by_name(name='嘻红豆', pages=1) 71 | for i in result_iterator: 72 | print(i) 73 | result_iterator2 = weibo_scraper.get_weibo_tweets_by_name(name='nicknameisnotexist', pages=1) 74 | for i in result_iterator2: 75 | print(i) 76 | 77 | def test_get_containerid_from_second_profile(self): 78 | result_iterator = weibo_scraper.get_weibo_tweets_by_name(name='来去之间', pages=1) 79 | for i in result_iterator: 80 | print(i) 81 | self.assertIsNotNone(result_iterator) 82 | 83 | def test_weibo_get_index_parser(self): 84 | # test get weibo profile 85 | get_inex_response = weibo_getIndex(uid_value='1111681197') 86 | wgip = WeiboGetIndexParser(get_index_api_response=get_inex_response) 87 | print(wgip) 88 | 89 | def test_weibo_parser(self): 90 | # Helixcs need login to get cookies SUB 91 | # tweet_response = weibo_scraper.weibo_tweets(containerid='1076031843242321', page=1) 92 | # wp = WeiboTweetParser(tweet_get_index_response=tweet_response) 93 | # print(wp) 94 | pass 95 | 96 | def test_get_weibo_profile(self): 97 | wp = weibo_scraper.get_weibo_profile(name='嘻红豆') 98 | print(wp.raw_user_response) 99 | 100 | wp_uid = weibo_scraper.get_weibo_profile(uid='3637346297') 101 | print(wp_uid.raw_user_response) 102 | 103 | def test_follows_and_followers(self): 104 | for user in weibo_scraper.get_follows(name='嘻红豆', max_item_limit=1): 105 | print(user) 106 | 107 | print("==" * 10) 108 | for user in weibo_scraper.get_followers(name='嘻红豆', max_item_limit=1): 109 | print(user) 110 | 111 | def test_comments_request_with_structure(self): 112 | """ 113 | https://m.weibo.cn/comments/hotflow?id=4257059677028285&mid=4257059677028285 114 | :return: 115 | """ 116 | 117 | weibo_comments_res = weibo_comments(id="4257059677028285", mid='4257059677028285') 118 | wcp = WeiboCommentParser(weibo_comments_res) 119 | print(wcp.comment_meta) 120 | 121 | def test_txt_export(self): 122 | from persistence import persistence 123 | persistence.dispatch(name='嘻红豆', pages=1, is_simplify=True, persistence_format="txt", 124 | export_file_name="梁群茹2txt", is_debug=True) 125 | 126 | def test_weibo_tweets_with_comments(self): 127 | """weibo comments""" 128 | for i in weibo_scraper.get_formatted_weibo_tweets_by_name(name='嘻红豆', with_comments=True, pages=1): 129 | for j in i.cards_node: 130 | print(str(j.mblog.comment_parser)) 131 | 132 | 133 | if __name__ == '__main__': 134 | unittest.main() 135 | -------------------------------------------------------------------------------- /weibo_base/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Author: Helixcs 5 | Site: https://github.com/Xarrow/weibo-scraper 6 | File: __init__.py.py 7 | Time: 5/19/18 8 | """ 9 | from .weibo_typing import * 10 | from .weibo_api import * 11 | from .weibo_component import * 12 | from .weibo_util import * 13 | from .weibo_parser import * 14 | 
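The tests above exercise the public surface that `weibo_base/__init__.py` re-exports. A minimal end-to-end sketch of the same flow, resolving a nickname to a uid, deriving the tweets containerid, and then paging through tweets; the nickname is an example value:

```python
import weibo_scraper
from weibo_base import exist_get_uid, get_tweet_containerid, weibo_tweets

uid_result = exist_get_uid(name="嘻红豆")   # {'exist': True, 'name': '嘻红豆', 'uid': ...}
if uid_result.get("exist"):
    containerid = get_tweet_containerid(uid=str(uid_result.get("uid")))
    raw_page = weibo_tweets(containerid=containerid, page=1)   # raw JSON dict
    print(raw_page.get("ok"))

    # higher-level iterator from weibo_scraper.py
    for tweet in weibo_scraper.get_weibo_tweets(tweet_container_id=containerid, pages=1):
        print(tweet)
```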
-------------------------------------------------------------------------------- /weibo_base/weibo_api.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Author: Helixcs 5 | Site: https://github.com/Xarrow/weibo-scraper 6 | File: weibo_api.py 7 | Time: 5/19/18 8 | """ 9 | from typing import Optional 10 | from weibo_base.weibo_util import RequestProxy, WeiboApiException 11 | 12 | requests = RequestProxy() 13 | Response = Optional[dict] 14 | 15 | _GET_INDEX = "https://m.weibo.cn/api/container/getIndex" 16 | _GET_SECOND = "https://m.weibo.cn/api/container/getSecond" 17 | _COMMENTS_HOTFLOW = "https://m.weibo.cn/comments/hotflow" 18 | 19 | 20 | def search_by_name(name: str) -> Response: 21 | """get summary info which searched by name, 22 | this api is like 'https://m.weibo.cn/api/container/getIndex?queryVal=&containerid=100103type%3D3%26q%3D' 23 | 24 | >>> from weibo_base import search_by_name 25 | >>> _response = search_by_name('Helixcs') 26 | :param name: nick name which you want to search 27 | :return json string including summary info 28 | """ 29 | _params = {'queryVal': name, 'containerid': '100103type%3D3%26q%3D' + name} 30 | _response = requests.get(url=_GET_INDEX, params=_params) 31 | if _response.status_code == 200: 32 | return _response.json() 33 | return None 34 | 35 | 36 | def weibo_getIndex(uid_value: str) -> Response: 37 | """ 38 | get personal summary info which request by uid, and uid is got by 'search_by_name' 39 | this api is like 'https://m.weibo.cn/api/container/getIndex?type=uid&value=' 40 | 41 | >>> from weibo_base import weibo_getIndex 42 | >>> _response = weibo_getIndex('1843242321') 43 | :param uid_value: 44 | :return: 45 | """ 46 | _params = {"type": "uid", "value": uid_value} 47 | _response = requests.get(url=_GET_INDEX, params=_params) 48 | if _response.status_code == 200: 49 | return _response.json() 50 | return None 51 | 52 | 53 | def weibo_tweets(containerid: str, page: int) -> Response: 54 | """ 55 | get person weibo tweets which from contaninerid in page, 56 | this api is like 'https://m.weibo.cn/container/getIndex?containerid=&page=' 57 | >>> from weibo_base import weibo_tweets 58 | >>> _response = weibo_tweets(contaierid='1076031843242321',page=1) 59 | :param containerid: 60 | :param page: page 61 | :return: 62 | """ 63 | _params = {"containerid": containerid, "page": page} 64 | _response = requests.get(url=_GET_INDEX, params=_params) 65 | if _response.status_code == 200 and _response.json().get("ok") == 1: 66 | return _response.json() 67 | raise WeiboApiException( 68 | "weibo_tweets request failed, url={0},params={1},response={2}".format(_GET_INDEX, _params, 69 | _response if _response is None else _response.text)) 70 | 71 | 72 | def weibo_containerid(containerid: str, page: int) -> Response: 73 | """ 74 | 75 | :param containerid: 76 | :param page: 77 | :return: 78 | """ 79 | _params = {"containerid": containerid, "page": page} 80 | _response = requests.get(url=_GET_INDEX, params=_params) 81 | if _response.status_code == 200 and _response.json().get("ok") == 1: 82 | return _response.json() 83 | raise WeiboApiException( 84 | "weibo_containerid request failed, url={0},params={1},response={2}".format(_GET_INDEX, _params, _response)) 85 | 86 | 87 | def weibo_second(containerid: str, page: int) -> Response: 88 | """ 89 | https://m.weibo.cn/api/container/getSecond 90 | :param containerid: 91 | :param page: 92 | :return: 93 | """ 94 | _params = {"containerid": containerid, "page": page} 95 
| _response = requests.get(url=_GET_SECOND, params=_params) 96 | if _response.status_code == 200 and _response.json().get("ok") == 1: 97 | return _response.json() 98 | raise WeiboApiException( 99 | "weibo_second request failed, url={0},params={1},response={2}".format(_GET_SECOND, _params, _response)) 100 | 101 | 102 | def weibo_comments(id: str, mid: str) -> Response: 103 | """ 104 | https://m.weibo.cn/comments/hotflow?id=4257059677028285&mid=4257059677028285 105 | get comments from userId and mid 106 | :param id: userId 107 | :param mid: mid 108 | :return: 109 | """ 110 | _params = {"id": id, "mid": mid} 111 | _response = requests.get(url=_COMMENTS_HOTFLOW, params=_params) 112 | if _response.status_code == 200 and _response.json().get("ok") == 1: 113 | return _response.json() 114 | raise WeiboApiException( 115 | "weibo_comments request failed, url={0},params={1},response={2}".format(_COMMENTS_HOTFLOW, _params, _response)) 116 | 117 | 118 | def realtime_hotword(): 119 | _params = {"containerid": "106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot"} 120 | _response = requests.get(url=_GET_INDEX, params=_params) 121 | 122 | if _response.status_code == 200 and _response.json().get("ok") == 1: 123 | return _response.json() 124 | raise WeiboApiException( 125 | "weibo_comments request failed, url={0},params={1},response={2}".format(_COMMENTS_HOTFLOW, _params, _response)) 126 | 127 | 128 | # ----------------------------------- use by cookie --------------------------- 129 | HEADER = { 130 | "Connection": "keep-alive", 131 | "Host": "passport.weibo.cn", 132 | "Upgrade-Insecure-Requests": "1", 133 | "Referer": "https://passport.weibo.cn/signin/login?entry=mweibo&r=http%3A%2F%2Fweibo.cn%2F&backTitle=%CE%A2%B2%A9&vt=", 134 | "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36" 135 | } 136 | 137 | AFTER_HEADER = { 138 | "Accept": "application/json, text/plain, */*", 139 | "Host": "m.weibo.cn", 140 | "Origin": "https://m.weibo.cn", 141 | "Referer": "https://m.weibo.cn/u/", 142 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", 143 | "X-Requested-With": "XMLHttpRequest" 144 | } 145 | 146 | PC_HEADER = { 147 | "Host": "weibo.com", 148 | "Origin": "https://weibo.com", 149 | "Referer": "https://weibo.com/ZhangJianForV/home?topnav=1&wvr=6", 150 | "User-Agent": "Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.25 (KHTML, like Gecko) Version/11.0 Mobile/15A5304j Safari/604.1", 151 | "X-Requested-With": "XMLHttpRequest" 152 | } 153 | 154 | 155 | class WeiboV2(object): 156 | def __init__(self, username, password): 157 | self.username = username 158 | self.password = password 159 | self.request = requests.session() 160 | self.cookies = None 161 | self.st = None 162 | self.userid = None 163 | 164 | def login_for_sso(self): 165 | login_url = 'https://passport.weibo.cn/sso/login' 166 | data = { 167 | 'username': self.username, 168 | 'password': self.password, 169 | 'savestate': '1', 170 | 'r': 'http://weibo.cn/', 171 | 'ec': '0', 172 | 'pagerefer': '', 173 | 'entry': 'mweibo', 174 | 'wentry': '', 175 | 'loginfrom': '', 176 | 'client_id': '', 177 | 'code': '', 178 | 'qq': '', 179 | 'mainpageflag': '1', 180 | 'hff': '', 181 | 'hfp': '' 182 | } 183 | headers = HEADER 184 | r_login = self.request.post(url=login_url, data=data, headers=headers) 185 | 186 | if not r_login.text.__contains__('20000000'): 
187 | if r_login.json().get('retcode') == 50050011: 188 | errurl = r_login.json().get('data').get('errurl') 189 | self.phone_verify(errurl) 190 | else: 191 | raise Exception("login_for_sso failed !", r_login.text) 192 | 193 | self.cookies = r_login.cookies.get_dict() 194 | 195 | def phone_verify(self, errurl): 196 | print(errurl) 197 | req = self.request.get(errurl) 198 | print(req.text) 199 | pass 200 | 201 | def get_uid(self): 202 | """get uid""" 203 | response = self.request.get(url='https://m.weibo.cn/', cookies=self.cookies) 204 | if response.status_code == 200 and response.text.__contains__('uid') > 0: 205 | self.userid = response.text[response.text.index('"uid":"') + len('"uid":"'):response.text.index('","ctrl"')] 206 | 207 | def get_st(self): 208 | """get st """ 209 | r = self.request.get(url='https://m.weibo.cn/u/' + self.userid, cookies=self.cookies) 210 | if r.status_code == 200 and r.text.__contains__("st") > 0: 211 | _response = r.text 212 | if str(_response).__contains__("st: '") > 0: 213 | self.st = _response[_response.index("st: '") + len("st: '"):_response.index("',\n login:")] 214 | elif str(_response).__contains__('"st":"') > 0: 215 | self.st = _response[_response.index('"st":"') + len('"st":"'):_response.index('","isInClient')] 216 | 217 | def check_cookie_expired(self): 218 | """check cookies whether expired""" 219 | response = self.request.get(url='https://m.weibo.cn/', cookies=self.cookies) 220 | if response.status_code == 200: 221 | return response.text.__contains__(self.userid) 222 | return False 223 | 224 | def check_cookies(self): 225 | """ 226 | check cookie 227 | """ 228 | if self.cookies is None or not self.check_cookie_expired(): 229 | return False 230 | return True 231 | 232 | def re_login(self): 233 | """login retry""" 234 | self.login_for_sso() 235 | self.get_uid() 236 | self.get_st() 237 | 238 | def _weibo_getIndex(self, userid): 239 | """ 240 | 微博概要内容API 241 | https://m.weibo.cn/api/container/getIndex?type=uid&value=3637346297 242 | :param value: 243 | :return: 244 | """ 245 | api = 'http://m.weibo.cn/api/container/getIndex' 246 | param = {"type": "uid", "value": userid} 247 | return self.request.get(url=api, params=param) 248 | 249 | def _weibo_content(self, containerid, page=1): 250 | """ 251 | 微博内容API 252 | 1076033637346297 253 | https://m.weibo.cn/api/container/getIndex?containerid=1076033637346297 254 | :param containerid: 255 | :return: 256 | """ 257 | api = "https://m.weibo.cn/api/container/getIndex" 258 | params = {"containerid": containerid, "page": page} 259 | return self.request.get(url=api, params=params) 260 | 261 | def send_words_on_pc(self, word): 262 | """ 263 | PC端发送微博 264 | https://weibo.com/aj/mblog/add 265 | ======================= 266 | title:有什么新鲜事想告诉大家? 
267 | location:v6_content_home 268 | text:[doge] 269 | appkey: 270 | style_type:1 271 | pic_id: 272 | tid: 273 | pdetail: 274 | rank:0 275 | rankid: 276 | pub_source:page_2 277 | longtext:1 278 | topic_id:1022: 279 | pub_type:dialog 280 | _t:0 281 | :param word: 282 | :return: 283 | """ 284 | api = 'https://weibo.com/aj/mblog/add?ajwvr=6&__rnd=1511200888604' 285 | data = { 286 | "title": "有什么新鲜事想告诉大家?", 287 | "location": "v6_content_home", 288 | "text": word, 289 | "appkey": "", 290 | "style_type": "1", 291 | "pic_id": "", 292 | "tid": "", 293 | "pdetail": "", 294 | "rank": "0", 295 | "rankid": "", 296 | "pub_source": "page_2", 297 | "longtext": "1", 298 | "topic_id": "1022:", 299 | "pub_type": "dialog", 300 | "_t": 0 301 | } 302 | return self.request.post(url=api, data=data, cookies=self.cookies, headers=PC_HEADER).text 303 | -------------------------------------------------------------------------------- /weibo_base/weibo_component.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Verion: 1.0 5 | Author: Helixcs 6 | Site: https://github.com/Xarrow/weibo-scraper 7 | File: weibo_component.py 8 | Time: 11/25/18 9 | """ 10 | # =========== api component ============== 11 | from typing import Dict 12 | from weibo_base.weibo_api import search_by_name 13 | from weibo_base.weibo_parser import weibo_getIndex, WeiboGetIndexParser 14 | 15 | 16 | def exist_get_uid(search_by_name_response: str = None, name: str = "") -> Dict: 17 | """ 18 | whether name is exist in response which from search api, if exist ,return uid 19 | :param search_by_name_response: 20 | :param name: 21 | :return: 22 | """ 23 | if not search_by_name_response or str(search_by_name_response) == '': 24 | search_by_name_response = search_by_name(name) 25 | # bad request 26 | if search_by_name_response.get('ok') != 1: 27 | return {"exist": False, "name": name, "uid": None} 28 | card_type = [card for card in search_by_name_response.get("data").get("cards") if card.get('card_type') == 11] 29 | if len(card_type) < 1: 30 | return {"exist": False, "name": name, "uid": None} 31 | 32 | user = card_type[0].get('card_group')[0].get('user') 33 | screen_name = user.get('screen_name') 34 | if screen_name == name: 35 | return {"exist": True, "name": name, "uid": user.get('id')} 36 | return {"exist": False, "name": name, "uid": None} 37 | 38 | 39 | def get_tweet_containerid(weibo_get_index_response: str = None, uid: str = ""): 40 | """ 41 | get weibo_containerid 42 | :param weibo_get_index_response: 43 | :param uid: uid 44 | :return: weibo_containerid 45 | """ 46 | 47 | if weibo_get_index_response is None or str(weibo_get_index_response) == '': 48 | weibo_get_index_response = weibo_getIndex(uid) 49 | if weibo_get_index_response.get('ok') != 1: 50 | return None 51 | 52 | weibo_get_index_parser = WeiboGetIndexParser(get_index_api_response=weibo_get_index_response) 53 | return weibo_get_index_parser.tweet_containerid 54 | -------------------------------------------------------------------------------- /weibo_base/weibo_exception.py: -------------------------------------------------------------------------------- 1 | # -*- coding=utf-8 -*- 2 | """ 3 | @Author xuanji.zj 4 | @Email xuanji.zj@alibaba-inc.com 5 | @Time 2021/6/2 4:26 下午 6 | @desc Add New Functions In weibo_exception 7 | 8 | """ 9 | 10 | 11 | class ApiException(Exception): 12 | def __init__(self, *args, **kwargs): 13 | pass 14 | -------------------------------------------------------------------------------- 
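Besides the anonymous getIndex endpoints, `weibo_api.py` above also defines the cookie-based `WeiboV2` client. A hypothetical sketch of its flow; the credentials are placeholders, and a real login may be redirected to SMS verification (`phone_verify`), so success is not guaranteed:

```python
from weibo_base.weibo_api import WeiboV2

wb = WeiboV2(username="your_account", password="your_password")  # placeholder credentials
wb.re_login()                  # login_for_sso() + get_uid() + get_st()
if wb.check_cookies():         # cookies present and not expired
    print(wb.userid, wb.st)
    print(wb.send_words_on_pc("hello from weibo-scraper"))  # post a status via the PC endpoint
```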
/weibo_base/weibo_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Verion: 1.0 5 | Author: Helixcs 6 | Site: https://github.com/Xarrow/weibo-scraper 7 | File: weibo_parser.py 8 | Time: 11/25/18 9 | """ 10 | 11 | import datetime 12 | import re 13 | 14 | from weibo_base.weibo_util import logger 15 | from weibo_base.weibo_api import weibo_tweets, weibo_getIndex, WeiboApiException, weibo_comments 16 | from weibo_base.weibo_typing import _JSONResponse, _StrFieldResponse, _IntFieldResponse 17 | from typing import List, Optional 18 | 19 | now = datetime.datetime.now() 20 | CURRENT_TIME = now.strftime('%Y-%m-%d %H:%M:%S') 21 | CURRENT_YEAR = now.strftime('%Y') 22 | CURRENT_YEAR_WITH_DATE = now.strftime('%Y-%m-%d') 23 | 24 | 25 | # ========== User Metadata =============== 26 | class BaseParser(object): 27 | def __init__(self, raw_response: dict): 28 | self._get_time = CURRENT_TIME 29 | self._raw_response = raw_response 30 | 31 | @property 32 | def raw_response(self) -> _JSONResponse: 33 | return self._raw_response 34 | 35 | 36 | class UserMeta(object): 37 | """weibo user meta data """ 38 | 39 | def __init__(self, user_node: dict): 40 | self.user_node = user_node 41 | 42 | @property 43 | def raw_user_response(self) -> _JSONResponse: 44 | return self.user_node 45 | 46 | @property 47 | def id(self) -> _StrFieldResponse: 48 | return self.user_node.get('id') 49 | 50 | @property 51 | def screen_name(self) -> _StrFieldResponse: 52 | return self.user_node.get('screen_name') 53 | 54 | @property 55 | def profile_image_url(self) -> _StrFieldResponse: 56 | return self.user_node.get('profile_image_url') 57 | 58 | @property 59 | def profile_url(self) -> _StrFieldResponse: 60 | return self.user_node.get('profile_url') 61 | 62 | @property 63 | def description(self) -> _StrFieldResponse: 64 | return self.user_node.get('description') 65 | 66 | @property 67 | def gender(self) -> _StrFieldResponse: 68 | return self.user_node.get('gender') 69 | 70 | @property 71 | def followers_count(self) -> _IntFieldResponse: 72 | return self.user_node.get('followers_count') 73 | 74 | @property 75 | def follow_count(self) -> _IntFieldResponse: 76 | return self.user_node.get('follow_count') 77 | 78 | @property 79 | def cover_image_phone(self) -> _StrFieldResponse: 80 | return self.user_node.get('cover_image_phone') 81 | 82 | @property 83 | def avatar_hd(self) -> _StrFieldResponse: 84 | return self.user_node.get('avatar_hd') 85 | 86 | def __repr__(self): 87 | return "".format(repr(self.id), repr(self.screen_name), repr(self.description), 89 | repr(self.gender), 90 | repr(self.avatar_hd), repr(self.profile_image_url)) 91 | 92 | 93 | # ============== Comments Metadata====================== 94 | 95 | class CommentMeta(object): 96 | __slots__ = ["_comment_meta"] 97 | 98 | def __init__(self, comment_meta: _JSONResponse) -> None: 99 | self._comment_meta = comment_meta 100 | 101 | @property 102 | def raw_comment_meta(self) -> _JSONResponse: 103 | return self._comment_meta 104 | 105 | @raw_comment_meta.setter 106 | def raw_comment_meta(self, value: _JSONResponse): 107 | self._comment_meta = value 108 | 109 | @property 110 | def created_at(self) -> _StrFieldResponse: 111 | return None if self._comment_meta is None else self._comment_meta.get("created_at") 112 | 113 | @property 114 | def id(self) -> _StrFieldResponse: 115 | return None if self._comment_meta is None else self._comment_meta.get("id") 116 | 117 | @property 118 | def rootid(self) -> _JSONResponse: 
119 | return None if self._comment_meta is None else self._comment_meta.get("rootid") 120 | 121 | @property 122 | def floor_number(self) -> _JSONResponse: 123 | return None if self._comment_meta is None else self._comment_meta.get("floor_number") 124 | 125 | @property 126 | def text(self) -> _JSONResponse: 127 | return None if self._comment_meta is None else self._comment_meta.get("text") 128 | 129 | @property 130 | def user(self) -> Optional[UserMeta]: 131 | return None if self._comment_meta is None else UserMeta(user_node=self._comment_meta.get("user")) 132 | 133 | @property 134 | def mid(self) -> _StrFieldResponse: 135 | return None if self._comment_meta is None else self._comment_meta.get("mid") 136 | 137 | @property 138 | def comments(self): 139 | return None if self._comment_meta is None else self._comment_meta.get("comments") 140 | 141 | @property 142 | def max_id(self) -> _IntFieldResponse: 143 | return None if self._comment_meta is None else self._comment_meta.get("max_id") 144 | 145 | @property 146 | def total_number(self) -> _IntFieldResponse: 147 | return None if self._comment_meta is None else self._comment_meta.get("total_number") 148 | 149 | @property 150 | def isLikedByMblogAuthor(self): 151 | return None if self._comment_meta is None else self._comment_meta.get("isLikedByMblogAuthor") 152 | 153 | @property 154 | def bid(self) -> _StrFieldResponse: 155 | return None if self._comment_meta is None else self._comment_meta.get("bid") 156 | 157 | @property 158 | def source(self) -> _StrFieldResponse: 159 | return None if self._comment_meta is None else self._comment_meta.get("source") 160 | 161 | @property 162 | def like_count(self) -> _IntFieldResponse: 163 | return None if self._comment_meta is None else self._comment_meta.get("like_count") 164 | 165 | def __repr__(self): 166 | return r"".format(repr(self.id), repr(self.mid), repr(self.text)) 167 | 168 | 169 | _ListCommentMeta = List[CommentMeta] 170 | 171 | 172 | class WeiboCommentParser(object): 173 | """ weibo comments structure 174 | sample as :https://m.weibo.cn/comments/hotflow?id=4257059677028285&mid=4257059677028285 175 | """ 176 | 177 | __slots__ = ['_comment_node'] 178 | 179 | def __init__(self, comment_node: _JSONResponse) -> None: 180 | self._comment_node = comment_node 181 | 182 | @property 183 | def raw_comment_node(self): 184 | return self._comment_node 185 | 186 | @raw_comment_node.setter 187 | def raw_comment_node(self, value: _JSONResponse): 188 | self._comment_node = value 189 | 190 | @property 191 | def outer_data_node(self) -> _JSONResponse: 192 | if self._comment_node is None: return None 193 | if self._comment_node.get("data") is None: 194 | return None 195 | return self._comment_node.get("data") 196 | 197 | @property 198 | def total_number(self) -> _IntFieldResponse: 199 | return None if self.outer_data_node is None else self.outer_data_node.get("total_number") 200 | 201 | @property 202 | def comment_meta(self) -> _ListCommentMeta: 203 | return None if self.outer_data_node is None \ 204 | else [CommentMeta(comment_meta=single_comment_node) 205 | for single_comment_node in self.outer_data_node.get("data")] 206 | 207 | def __repr__(self): 208 | return r"".format(repr(self.raw_comment_node)) 209 | 210 | 211 | class PicMeta(object): 212 | def __init__(self, pic_node: dict) -> None: 213 | self.pic_node = pic_node 214 | 215 | @property 216 | def raw_pics(self) -> _JSONResponse: 217 | return self.pic_node 218 | 219 | @property 220 | def pid(self) -> _StrFieldResponse: 221 | return self.pic_node.get('pid') if 
self.pic_node.get('pid') is not None else None 222 | 223 | @property 224 | def url(self) -> _StrFieldResponse: 225 | return self.pic_node.get("url") if self.pic_node.get("url") is not None else None 226 | 227 | @property 228 | def large_url(self) -> _StrFieldResponse: 229 | return self.pic_node.get('large').get('url') if self.pic_node.get('large') is not None else None 230 | 231 | 232 | class MBlogMeta(object): 233 | __slots__ = ['_mblog_node', '_comment_parser'] 234 | 235 | def __init__(self, mblog_node): 236 | self._mblog_node = mblog_node 237 | self._comment_parser = None 238 | 239 | @property 240 | def raw_mblog_node(self) -> _JSONResponse: 241 | return self._mblog_node 242 | 243 | @raw_mblog_node.setter 244 | def raw_mblog_node(self, value: _JSONResponse): 245 | self._mblog_node = value 246 | 247 | @property 248 | def comment_parser(self) -> WeiboCommentParser: 249 | return self._comment_parser 250 | 251 | @comment_parser.setter 252 | def comment_parser(self, value: WeiboCommentParser): 253 | if value is None: 254 | global comment_response 255 | try: 256 | comment_response = weibo_comments(id=self.id, mid=self.mid) 257 | self._comment_parser = WeiboCommentParser(comment_response) 258 | except Exception as ex: 259 | logger.error( 260 | "MBlogMeta#comment_parser(settler) value is None , request weibo comments occurred an exception," 261 | "ex=%s , comment_response=%s" % (ex, comment_response)) 262 | self._comment_parser = None 263 | else: 264 | self._comment_parser = value 265 | 266 | @property 267 | def created_at(self) -> _StrFieldResponse: 268 | created_at = self._mblog_node.get('created_at') 269 | # sample as "08-01" -> "2018-08-01" 270 | if len(created_at) < 9 and "-" in created_at: 271 | created_at = CURRENT_YEAR + "-" + created_at 272 | # sample as "几分钟" 273 | if not str(created_at).__contains__("-"): 274 | created_at = CURRENT_YEAR_WITH_DATE 275 | return created_at 276 | 277 | @property 278 | def id(self) -> _StrFieldResponse: 279 | return self._mblog_node.get('id') if self._mblog_node is not None else None 280 | 281 | @property 282 | def idstr(self) -> _StrFieldResponse: 283 | return self._mblog_node.get('idstr') if self._mblog_node is not None else None 284 | 285 | @property 286 | def mid(self) -> _StrFieldResponse: 287 | return self._mblog_node.get('mid') if self._mblog_node is not None else None 288 | 289 | @property 290 | def text(self) -> _StrFieldResponse: 291 | return self._mblog_node.get('text') if self._mblog_node is not None else None 292 | 293 | @property 294 | def source(self) -> _StrFieldResponse: 295 | return self._mblog_node.get('source') if self._mblog_node is not None else None 296 | 297 | @property 298 | def user(self) -> UserMeta: 299 | return UserMeta(user_node=self._mblog_node.get('user')) if self._mblog_node is not None else None 300 | 301 | @property 302 | def retweeted_status(self): 303 | return MBlogMeta(mblog_node=self._mblog_node.get('retweeted_status')) \ 304 | if self._mblog_node.get('retweeted_status') else None 305 | 306 | @property 307 | def reposts_count(self) -> _IntFieldResponse: 308 | return self._mblog_node.get('reposts_count') if self._mblog_node is not None else None 309 | 310 | @property 311 | def comments_count(self) -> _IntFieldResponse: 312 | return self._mblog_node.get('comments_count') if self._mblog_node is not None else None 313 | 314 | @property 315 | def obj_ext(self) -> _StrFieldResponse: 316 | return self._mblog_node.get('obj_ext') if self._mblog_node is not None else None 317 | 318 | @property 319 | def raw_text(self) -> 
_StrFieldResponse: 320 | return self._mblog_node.get('raw_text') if self._mblog_node is not None else None 321 | 322 | @property 323 | def bid(self) -> _StrFieldResponse: 324 | return self._mblog_node.get('bid') if self._mblog_node is not None else None 325 | 326 | @property 327 | def pics_node(self): 328 | return [PicMeta(pic) for pic in self._mblog_node.get('pics')] \ 329 | if self._mblog_node.get('pics') is not None else None 330 | 331 | 332 | class TweetMeta(object): 333 | """ weibo tweet meta data""" 334 | 335 | __slots__ = ['_card_node', '_mblog'] 336 | 337 | def __init__(self, card_node: dict) -> None: 338 | self._card_node = card_node 339 | self._mblog = MBlogMeta(mblog_node=self._card_node.get('mblog')) if self._card_node is not None else None 340 | 341 | @property 342 | def raw_card_node(self) -> dict: 343 | return self._card_node 344 | 345 | @raw_card_node.setter 346 | def raw_card_node(self, value: dict): 347 | self._card_node = value 348 | 349 | @property 350 | def itemid(self) -> _StrFieldResponse: 351 | return self._card_node.get('itemid') if self._card_node is not None else None 352 | 353 | @property 354 | def scheme(self) -> _StrFieldResponse: 355 | return self._card_node.get('scheme') if self._card_node is not None else None 356 | 357 | @property 358 | def mblog(self) -> MBlogMeta: 359 | return self._mblog 360 | 361 | @mblog.setter 362 | def mblog(self, value: MBlogMeta): 363 | self._mblog = value 364 | 365 | 366 | _ListTweetMetaFieldResponse = List[TweetMeta] 367 | 368 | """ 369 | - data: 370 | - cardlistInfo: 371 | - containerid 372 | 373 | - cards: 374 | - mblog: 375 | - retweeted_status 376 | .... 377 | - user 378 | .... 379 | """ 380 | 381 | 382 | class WeiboTweetParser(object): 383 | __slots__ = ['tweet_containerid', '_tweet_get_index_reponse', '_cards_node'] 384 | 385 | def __init__(self, tweet_get_index_response: dict = None, tweet_containerid: str = None) -> None: 386 | if tweet_get_index_response is None and tweet_containerid is None: 387 | raise WeiboApiException( 388 | "WeiboTweetParser#__init__ tweet_get_index_response and tweet_containerid is none !") 389 | 390 | self.tweet_containerid = tweet_containerid 391 | 392 | self._tweet_get_index_reponse = weibo_tweets(containerid=tweet_containerid) \ 393 | if tweet_get_index_response is None and tweet_containerid is not None \ 394 | else tweet_get_index_response 395 | 396 | self._cards_node = [TweetMeta(card_node=card) for card in list( 397 | filter(lambda card: card.get('card_group') is None, 398 | self._tweet_get_index_reponse.get('data').get('cards')))] 399 | 400 | @property 401 | def raw_tweet_response(self) -> _JSONResponse: 402 | return self._tweet_get_index_reponse 403 | 404 | @raw_tweet_response.setter 405 | def raw_tweet_response1(self, value: dict): 406 | self._tweet_get_index_reponse = value 407 | 408 | @property 409 | def card_list_info_node(self) -> _JSONResponse: 410 | return self._tweet_get_index_reponse.get('data').get('cardlistInfo') 411 | 412 | @property 413 | def cards_node(self) -> _ListTweetMetaFieldResponse: 414 | # skip recommended weibo tweet 415 | return self._cards_node 416 | 417 | @cards_node.setter 418 | def cards_node(self, value): 419 | self._cards_node = value 420 | 421 | @property 422 | def tweet_containerid_node(self) -> _StrFieldResponse: 423 | return self.card_list_info_node.get('containerid') 424 | 425 | @property 426 | def total(self) -> _IntFieldResponse: 427 | return self.card_list_info_node.get('page') 428 | 429 | def __repr__(self): 430 | return 
r"".format(repr(self.tweet_containerid_node)) 431 | 432 | 433 | class WeiboGetIndexParser(object): 434 | __slots__ = ['get_index_api_response', 'uid'] 435 | 436 | def __init__(self, get_index_api_response: dict = None, uid: str = None) -> None: 437 | if get_index_api_response is None and uid is None: 438 | raise WeiboApiException("In WeiboGetIndexParser , get_index_api_response and uid can not be None . ") 439 | elif get_index_api_response is not None: 440 | self.get_index_api_response = get_index_api_response 441 | self.uid = self.user_info_node.get('id') 442 | elif uid is not None: 443 | self.uid = uid 444 | self.get_index_api_response = weibo_getIndex(uid_value=self.uid) 445 | 446 | @property 447 | def raw_response(self) -> _JSONResponse: 448 | return self.get_index_api_response 449 | 450 | @property 451 | def user_info_node(self) -> _JSONResponse: 452 | return self.get_index_api_response.get('data').get('userInfo') 453 | 454 | @property 455 | def tabs_node(self) -> _JSONResponse: 456 | return self.get_index_api_response.get('data').get('tabsInfo').get('tabs') 457 | 458 | @property 459 | def fans_scheme_node(self) -> str: 460 | return self.get_index_api_response.get('data').get('fans_scheme') 461 | 462 | @property 463 | def follow_scheme_node(self) -> str: 464 | return self.get_index_api_response.get('data').get('follow_scheme') 465 | 466 | @property 467 | def scheme_node(self) -> _StrFieldResponse: 468 | return self.get_index_api_response.get('data').get('scheme') 469 | 470 | @property 471 | def user(self): 472 | """structure is similary with user""" 473 | return UserMeta(user_node=self.user_info_node) 474 | 475 | @property 476 | def profile_containerid(self) -> _StrFieldResponse: 477 | # weibo second profile api 478 | if isinstance(self.tabs_node, dict): 479 | return self.tabs_node.get('0').get('containerid') 480 | # weibo first profile api 481 | elif isinstance(self.tabs_node, list): 482 | return list(filter(lambda item: item.get('tab_type') == 'profile', self.tabs_node))[0].get('containerid') 483 | return None 484 | 485 | @property 486 | def weibo_containerid(self) -> _StrFieldResponse: 487 | # weibo second profile api 488 | if isinstance(self.tabs_node, dict): 489 | return self.tabs_node.get('1').get('containerid') 490 | # weibo first profile api 491 | elif isinstance(self.tabs_node, list): 492 | return list(filter(lambda item: item.get('tab_type') == 'weibo', self.tabs_node))[0] 493 | return None 494 | 495 | # this property is not exist in first weibo profile api 496 | @property 497 | def album_containerid(self) -> _StrFieldResponse: 498 | return self.tabs_node.get('3').get('containerid') if isinstance(self.tabs_node, dict) else None 499 | 500 | # two sample api 501 | # https://m.weibo.cn/api/container/getIndex?type=uid&value=1111681197 502 | # https://m.weibo.cn/api/container/getIndex?type=uid&value=1843242321 503 | @property 504 | def tweet_containerid(self): 505 | if isinstance(self.tabs_node, list): 506 | _weibo_containerid = list(filter(lambda tab: tab.get('tab_type') == 'weibo', self.tabs_node))[0].get( 507 | 'containerid') 508 | if _weibo_containerid.__contains__('WEIBO_SECOND_PROFILE_WEIBO'): 509 | return re.findall(r'(.+?)WEIBO_SECOND_PROFILE_WEIBO_PAY_BILL', 510 | list(filter(lambda tab: tab.get('tab_type') == 'weibo', self.tabs_node))[0].get( 511 | 'containerid'))[0] 512 | else: 513 | return _weibo_containerid 514 | elif isinstance(self.tabs_node, dict): 515 | _response_include_tweetid = weibo_tweets(containerid=self.profile_containerid, page=0) 516 | _cards = 
_response_include_tweetid.get('data').get('cards') 517 | return re.findall(r'containerid=(.+?)WEIBO_SECOND', 518 | list(filter(lambda _card: _card.get('itemid') == 'more_weibo', _cards))[0] 519 | .get('scheme'))[0] 520 | else: 521 | return None 522 | 523 | @property 524 | def follow_containerid_second(self): 525 | return re.findall(r'lfid=(.+?$)', self.scheme_node)[0] + '_-_FANS' if self.scheme_node is not None else None 526 | 527 | @property 528 | def follower_containerid_second(self): 529 | return re.findall(r'lfid=(.+?$)', self.scheme_node)[ 530 | 0] + '_-_FOLLOWERS' if self.scheme_node is not None else None 531 | 532 | @property 533 | def follower_containerid(self): 534 | return re.findall(r'containerid=(.+?)&luicode', self.fans_scheme_node)[0].replace("_intimacy", "") 535 | 536 | @property 537 | def follow_containerid(self): 538 | return re.findall(r'containerid=(.+?)&luicode', self.follow_scheme_node)[0].replace("recomm", "") 539 | 540 | def __repr__(self): 541 | return r"".format(repr(self.user.id)) 542 | 543 | 544 | # ========================= FollowAndFollower ============================ 545 | 546 | class FollowAndFollowerParser(object): 547 | __slots__ = ['follow_and_follower_response', 'follow_and_follower_containerid'] 548 | 549 | def __init__(self, follow_and_follower_response: dict, follow_and_follower_containerid: str = None): 550 | self.follow_and_follower_response = follow_and_follower_response 551 | self.follow_and_follower_containerid = follow_and_follower_containerid if follow_and_follower_containerid is not None else self.containerid 552 | 553 | @property 554 | def raw_follow_and_follower_response(self): 555 | return self.follow_and_follower_response 556 | 557 | @property 558 | def is_validate(self): 559 | if self.raw_follow_and_follower_response is None: 560 | return False 561 | if self.raw_follow_and_follower_response.get('ok') == 0: 562 | return False 563 | return True 564 | 565 | @property 566 | def data_node(self): 567 | return self.raw_follow_and_follower_response.get('data') if self.is_validate else None 568 | 569 | @property 570 | def count(self): 571 | return self.data_node.get('count') if self.data_node is not None else None 572 | 573 | @property 574 | def user_list(self): 575 | if self.data_node is None: 576 | return None 577 | return [UserMeta(user_node=card.get('user')) for card in self.data_node.get('cards')] 578 | 579 | @property 580 | def containerid(self): 581 | return self.raw_follow_and_follower_response.get('data').get('cardlistInfo').get('containerid') 582 | 583 | def __repr__(self): 584 | return "".format(repr(self.containerid)) 585 | 586 | 587 | class RealTimeHotWordResponse(object): 588 | __slots__ = ['_sequence', '_desc', '_hot', '_url'] 589 | 590 | def __init__(self, ): 591 | self._sequence = 0 592 | self._desc = "" 593 | self._hot = 0 594 | self._url = "" 595 | 596 | @property 597 | def sequence(self): 598 | return self._sequence 599 | 600 | @sequence.setter 601 | def sequence(self, sequence): 602 | self._sequence = sequence 603 | 604 | @property 605 | def desc(self): 606 | return self._desc 607 | 608 | @desc.setter 609 | def desc(self, desc): 610 | self._desc = desc 611 | 612 | @property 613 | def hot(self): 614 | return self._hot 615 | 616 | @hot.setter 617 | def hot(self, hot): 618 | self._hot = hot 619 | 620 | @property 621 | def url(self): 622 | return self._url 623 | 624 | @url.setter 625 | def url(self, url): 626 | self._url = url 627 | 628 | def __repr__(self): 629 | return "" % ( 630 | self._sequence, self._desc, self._hot, 
self._url,) 631 | -------------------------------------------------------------------------------- /weibo_base/weibo_typing.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Verion: 1.0 5 | Author: Helixcs 6 | Site: https://github.com/Xarrow/weibo-scraper 7 | File: weibo_typing.py 8 | Time: 11/25/18 9 | """ 10 | 11 | from typing import Optional, Dict 12 | 13 | _JSONResponse = Optional[Dict] 14 | _StrFieldResponse = Optional[str] 15 | _IntFieldResponse = Optional[int] 16 | -------------------------------------------------------------------------------- /weibo_base/weibo_util.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Author: Helixcs 5 | Site: https://github.com/Xarrow/weibo-scraper 6 | File: weibo_util.py 7 | Time: 5/19/18 8 | Descripton: weibo_util is in common use 9 | """ 10 | import logging 11 | import threading 12 | import sys 13 | import requests 14 | from contextlib import contextmanager 15 | from time import time 16 | 17 | level = logging.INFO 18 | ws_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' 19 | ws_datefmt = '%Y-%m-%d %H:%M' 20 | logging.basicConfig(level=level, format=ws_format, datefmt=ws_datefmt) 21 | logger = logging.getLogger(__name__) 22 | logger.setLevel(level) 23 | 24 | 25 | class WeiboApiException(Exception): 26 | def __init__(self, message): 27 | self.message = message 28 | 29 | 30 | class WeiboScraperException(Exception): 31 | def __init__(self, message): 32 | self.message = message 33 | 34 | 35 | def set_debug(): 36 | logger.setLevel(logging.DEBUG) 37 | 38 | 39 | def rt_logger(func): 40 | def func_wrapper(*args, **kwargs): 41 | __start_time = int(time() * 1000) 42 | __response = func(*args, **kwargs) 43 | __end_time = int(time() * 1000) 44 | if is_debug: 45 | logger.debug("[ws] [rt_logger] func: [ %s ], args:[ %s ] execute spend: [ %s ms ] ." 
% ( 46 | func.__name__, (args, kwargs), (__end_time - __start_time))) 47 | return __response 48 | 49 | return func_wrapper 50 | 51 | 52 | def api_ex_handle(func): 53 | pass 54 | 55 | 56 | def ws_handle(func): 57 | def func_wrapper(*args, **kwargs): 58 | is_debug = logger.level == logging.DEBUG 59 | start_time = int(time() * 1000) 60 | response = None 61 | try: 62 | response = func(*args, **kwargs) 63 | return response 64 | except WeiboApiException as ex: 65 | pass 66 | except Exception as ex: 67 | exc_type, exc_obj, exc_tb = sys.exc_info() 68 | _ext = [] 69 | _ = set() 70 | handle_exec_tb(exc_tb, _ext, _) 71 | logger.error("[exception] function:[{0}] exception , params:{1}, response:{2} ,ex:{4}, stack:{3}".format( 72 | func.__name__, 73 | (args, kwargs), 74 | ex, 75 | response, 76 | _ext)) 77 | raise ex 78 | finally: 79 | if is_debug: 80 | logger.debug( 81 | "[invoke process] function:[{0}] , params:{1}, response:{2} ".format( 82 | func.__name__, 83 | (args, kwargs), 84 | response)) 85 | 86 | end_time = int(time() * 1000) 87 | logger.debug("[cost time] function:[%s], args:[%s] execute spend:[%s ms]" % ( 88 | func.__name__, 89 | (args, kwargs), 90 | (end_time - start_time))) 91 | 92 | return func_wrapper 93 | 94 | 95 | def handle_exec_tb(tb_exec, _ext: list, cls_methods_tag_set: set): 96 | if not hasattr(tb_exec, "tb_frame"): 97 | return 98 | _fileName = _cls_methods_tag = tb_exec.tb_frame.f_code.co_filename 99 | _parameters = {} 100 | for k, v in tb_exec.tb_frame.f_locals.items(): 101 | if isinstance(v, object): 102 | if k == 'func': 103 | v = v.__name__ 104 | _cls_methods_tag += "_" + v 105 | if _cls_methods_tag in cls_methods_tag_set: 106 | return 107 | cls_methods_tag_set.add(_cls_methods_tag) 108 | else: 109 | v = str(v) 110 | _parameters[k] = v 111 | _ret = {"fname": _fileName, 112 | "lineno": tb_exec.tb_lineno, 113 | "parameters": _parameters} 114 | _ext.append(_ret) 115 | handle_exec_tb(tb_exec.tb_next, _ext, cls_methods_tag_set) 116 | 117 | 118 | class AntiStrategy(object): 119 | pass 120 | 121 | 122 | class RequestProxy(object): 123 | def __init__(self): 124 | super().__init__() 125 | 126 | def session(self): 127 | return requests.Session() 128 | 129 | def requests_proxy(self, method, url, **kwargs): 130 | """ 131 | request proxy 132 | """ 133 | # print("before request") 134 | proxies = { 135 | 'http': 'socks5://127.0.0.1:1086', 136 | 'https': 'socks5://127.0.0.1:1086', 137 | } 138 | # kwargs.setdefault("proxies", proxies) 139 | response = requests.request(method, url, **kwargs) 140 | # print("after request") 141 | return response 142 | 143 | def get(self, url, params=None, **kwargs): 144 | """ 145 | @see requests.sessions.Session 146 | """ 147 | kwargs.setdefault('allow_redirects', True) 148 | return self.requests_proxy('GET', url, params=params, **kwargs) 149 | 150 | def post(self, url, data=None, json=None, **kwargs): 151 | return self.requests_proxy('post', url, data=data, json=json, **kwargs) 152 | 153 | 154 | @contextmanager 155 | def open_file(file_name: str): 156 | file = open(file=file_name, mode='wb') 157 | yield file 158 | file.flush() 159 | file.close() 160 | 161 | 162 | class Timer(object): 163 | __slots__ = ['_name', '_timer', '_fn', '_interval', '_ignore_ex', '_on_result', '_on_exception', 164 | '_args', '_kwargs'] 165 | 166 | def __init__(self, 167 | name, 168 | fn, 169 | interval=7, 170 | *args, 171 | **kwargs): 172 | """ 173 | :param name: timer name 174 | :param fn: function which scheduler 175 | :param interval: scheduler interval, default 7s 176 | :param args: 
args in function 177 | :param kwargs: kwargs in function 178 | """ 179 | # 180 | self._name = name 181 | # Thread.Timer 182 | self._timer = None 183 | # function which callable 184 | self._fn = fn 185 | # timer interval default 7s 186 | self._interval = interval 187 | # whether ignore invoke exception 188 | self._ignore_ex = False 189 | self._on_result = None 190 | self._on_exception = None 191 | # function args 192 | self._args = args 193 | # function kwargs 194 | self._kwargs = kwargs 195 | 196 | @property 197 | def name(self): 198 | return self._name 199 | 200 | def set_name(self, name): 201 | self._name = name 202 | return self 203 | 204 | @property 205 | def fn(self): 206 | return self._fn 207 | 208 | def set_fn(self, fn): 209 | self._fn = fn 210 | return self 211 | 212 | @property 213 | def interval(self, ): 214 | return self._interval 215 | 216 | def set_interval(self, interval): 217 | self._interval = interval 218 | return self 219 | 220 | @property 221 | def ignore_ex(self): 222 | return self._ignore_ex 223 | 224 | def set_ignore_ex(self, ignore_ex): 225 | self._ignore_ex = ignore_ex 226 | return self 227 | 228 | @property 229 | def on_result(self): 230 | return self._on_result 231 | 232 | def set_on_result(self, fn): 233 | self._on_result = fn 234 | return self 235 | 236 | @property 237 | def on_exception(self): 238 | return self._on_exception 239 | 240 | def set_on_exception(self, fn): 241 | self._on_exception = fn 242 | return self 243 | 244 | def alive(self): 245 | if self._timer is None: 246 | return False 247 | return self._timer.is_alive() 248 | 249 | def scheduler(self): 250 | try: 251 | res = self._fn(*self._args, **self._kwargs) 252 | if self._on_result: 253 | self._on_result(res) 254 | except Exception as ex: 255 | if self._on_exception: 256 | self._on_exception(ex) 257 | if not self._ignore_ex: 258 | # stop timer 259 | raise ex 260 | self._timer = threading.Timer(self._interval, self.scheduler, ) 261 | self._timer.start() 262 | 263 | def cancel(self): 264 | if self._timer: 265 | self._timer.cancel() 266 | 267 | 268 | class TimerManager(object): 269 | def __init__(self, ): 270 | self._timers_container = {} 271 | self._executed = False 272 | 273 | def all_timers(self): 274 | return self._timers_container 275 | 276 | def add_timer(self, timer): 277 | self._timers_container[timer.name] = timer 278 | return self 279 | 280 | def execute(self): 281 | """ 282 | scheduler all timer in manager 283 | :return: None 284 | """ 285 | if self._executed: 286 | return 287 | for name, timer in self._timers_container.items(): 288 | if timer.alive(): 289 | continue 290 | timer.scheduler() 291 | self._executed = True 292 | 293 | def cancel_timer(self, timer_name=None, ): 294 | """ 295 | cancel timer , and nacos timer still in container 296 | it can execute again. 
297 | :param timer_name: 298 | :return: None 299 | """ 300 | timer = self._timers_container.get(timer_name) 301 | if timer: 302 | timer.cancel() 303 | 304 | def cancel(self): 305 | """ 306 | cancel all timer in container 307 | :return: None 308 | """ 309 | for _, timer in self._timers_container.items(): 310 | timer.cancel() 311 | 312 | def stop_timer(self, timer_name): 313 | """ 314 | cancel nacos timer and remove it from timer container 315 | :param timer_name: 316 | :return: None 317 | """ 318 | self.cancel_timer(timer_name) 319 | self._timers_container.pop(timer_name) 320 | 321 | def stop(self): 322 | """ 323 | remove all timer, and it can not execute again 324 | """ 325 | self.cancel() 326 | self._timers_container.clear() 327 | -------------------------------------------------------------------------------- /weibo_scraper.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Author: helixcs 5 | Site: https://github.com/Xarrow/weibo-scraper 6 | File: weibo_scraper.py 7 | Time: 3/16/18 8 | """ 9 | import datetime 10 | import sys 11 | from typing import Iterator, Optional, List, Dict 12 | 13 | from weibo_base.weibo_api import weibo_tweets, weibo_getIndex, weibo_second, weibo_comments, realtime_hotword 14 | from weibo_base.weibo_component import exist_get_uid, get_tweet_containerid 15 | from weibo_base.weibo_parser import \ 16 | WeiboCommentParser, \ 17 | WeiboGetIndexParser, \ 18 | UserMeta, \ 19 | WeiboTweetParser, \ 20 | FollowAndFollowerParser, \ 21 | RealTimeHotWordResponse 22 | from weibo_base.weibo_util import ws_handle, WeiboScraperException 23 | 24 | try: 25 | assert sys.version_info.major == 3 26 | assert sys.version_info.minor >= 6 27 | except AssertionError: 28 | raise RuntimeError('weibo-scraper requires Python3.6+ !') 29 | 30 | now = datetime.datetime.now() 31 | CURRENT_TIME = now.strftime('%Y-%m-%d %H:%M:%S') 32 | CURRENT_YEAR = now.strftime('%Y') 33 | CURRENT_YEAR_WITH_DATE = now.strftime('%Y-%m-%d') 34 | 35 | _TweetsResponse = Optional[Iterator[Dict]] 36 | _UserMetaResponse = Optional[UserMeta] 37 | _WeiboGetIndexResponse = Optional[WeiboGetIndexParser] 38 | 39 | 40 | @ws_handle 41 | def get_weibo_tweets_by_name(name: str, pages: int = None) -> _TweetsResponse: 42 | """ 43 | Get raw weibo tweets by nick name without any authorization 44 | >>> from weibo_scraper import get_weibo_tweets_by_name 45 | >>> for tweet in get_weibo_tweets_by_name(name='嘻红豆', pages=1): 46 | >>> print(tweet) 47 | :param name: nick name which you want to search 48 | :param pages: pages ,default all pages 49 | :return: _TweetsResponse 50 | """ 51 | if name == '': 52 | raise WeiboScraperException("`name` can not be blank!") 53 | res = exist_get_uid(name=name) 54 | exist = res.get("exist") 55 | uid = res.get("uid") 56 | if exist: 57 | inner_tweet_container_id = get_tweet_containerid(uid=uid) 58 | yield from get_weibo_tweets(tweet_container_id=inner_tweet_container_id, pages=pages) 59 | else: 60 | raise WeiboScraperException("`{name}` can not find!".format(name=name)) 61 | 62 | @ws_handle 63 | def get_weibo_tweets(tweet_container_id: str, pages: int = None) -> _TweetsResponse: 64 | """ 65 | Get weibo tweets from mobile without authorization,and this containerid exist in the api of 66 | 67 | Compatibility: 68 | New Api 69 | 1. Search by Nname and get uid by this api "https://m.weibo.cn/api/container/getIndex?queryVal=来去之间&containerid=100103type%3D3%26q%3D来去之间" 70 | 2. 
Get profile info by uid , https://m.weibo.cn/api/container/getIndex?type=uid&value=1111681197 71 | 3. https://m.weibo.cn/api/container/getIndex?containerid=2302831111681197 72 | 4. Get weibo tweets by container in node of "tabs" ,https://m.weibo.cn/api/container/getIndex?containerid=2304131111681197_-_&page=6891 73 | >>> from weibo_scraper import get_weibo_tweets 74 | >>> for tweet in get_weibo_tweets(tweet_container_id='1076033637346297',pages=1): 75 | >>> print(tweet) 76 | :param tweet_container_id: request weibo tweets directly by tweet_container_id 77 | :param pages :default None 78 | :return _TweetsResponse 79 | """ 80 | 81 | # current_page_index = 1 82 | 83 | def gen(_inner_current_page=1): 84 | while True: 85 | if pages is not None and _inner_current_page > pages: 86 | break 87 | _response_json = weibo_tweets(containerid=tweet_container_id, page=_inner_current_page) 88 | # skip bad request 89 | if _response_json is None: 90 | continue 91 | # break failed response 92 | elif _response_json.get("ok") != 1: 93 | break 94 | # break end tweet 95 | elif _response_json.get('data').get("cards")[0].get('name') == '暂无微博': 96 | break 97 | _cards = _response_json.get('data').get("cards") 98 | for _card in _cards: 99 | # skip recommended tweets 100 | if _card.get("card_group"): 101 | continue 102 | # just yield field of mblog 103 | yield _card 104 | _inner_current_page += 1 105 | 106 | yield from gen() 107 | 108 | @ws_handle 109 | def get_formatted_weibo_tweets_by_name(name: str, 110 | with_comments: bool = False, 111 | pages: int = None) -> _TweetsResponse: 112 | """ 113 | Get formatted weibo tweets by nick name without any authorization 114 | >>> from weibo_scraper import get_formatted_weibo_tweets_by_name 115 | >>> result_iterator = get_formatted_weibo_tweets_by_name(name='嘻红豆', pages=None) 116 | >>> for user_meta in result_iterator: 117 | >>> for tweetMeta in user_meta.cards_node: 118 | >>> print(tweetMeta.mblog.text) 119 | :param name: nick name which you want to search 120 | :param with_comments , with comments 121 | :param pages: pages ,default all pages 122 | :return: _TweetsResponse 123 | """ 124 | if name == '': 125 | raise WeiboScraperException("name can not be blank!") 126 | egu_res = exist_get_uid(name=name) 127 | exist = egu_res.get("exist") 128 | uid = egu_res.get("uid") 129 | if exist: 130 | inner_tweet_containerid = get_tweet_containerid(uid=uid) 131 | yield from get_weibo_tweets_formatted(tweet_container_id=inner_tweet_containerid, 132 | with_comments=with_comments, 133 | pages=pages) 134 | else: 135 | raise WeiboScraperException("`{name}` can not find!".format(name=name)) 136 | 137 | @ws_handle 138 | def get_weibo_tweets_formatted(tweet_container_id: str, with_comments: bool, pages: int = None, 139 | max_item_limit: int = None) -> _TweetsResponse: 140 | """ 141 | Get weibo formatted tweets by container id 142 | 143 | Compatibility: 144 | New Api 145 | 1. Get uid by searching name via "https://m.weibo.cn/api/container/getIndex?queryVal=来去之间&containerid=100103type%3D3%26q%3D来去之间" 146 | 2. Get weibo profile containerid by uid via "https://m.weibo.cn/api/container/getIndex?type=uid&value=1111681197" 147 | 3. Get weibo tweet containerid by profile containerid via "https://m.weibo.cn/api/container/getIndex?containerid=2302831111681197" 148 | 3. 
Get weibo tweets by weet containerid via "https://m.weibo.cn/api/container/getIndex?containerid=2304131111681197_-_&page=6891" 149 | >>> from weibo_scraper import get_weibo_tweets_formatted 150 | >>> for tweet in get_weibo_tweets_formatted(tweet_container_id='1076033637346297',pages=1): 151 | >>> print(tweet) 152 | :param max_item_limit: 153 | :param with_comments: 154 | :param tweet_container_id: request weibo tweets directly by tweet_container_id 155 | :param pages :default None 156 | :return _TweetsResponse 157 | """ 158 | # TODO max items limit 159 | current_total_item = 0 160 | 161 | def weibo_tweets_gen(_inner_current_page=1): 162 | while True: 163 | if pages is not None and _inner_current_page > pages: 164 | break 165 | tweet_response_json = weibo_tweets(containerid=tweet_container_id, page=_inner_current_page) 166 | # skip bad request 167 | if tweet_response_json is None: 168 | continue 169 | elif tweet_response_json.get("ok") != 1: 170 | break 171 | weibo_tweet_parser = WeiboTweetParser(tweet_get_index_response=tweet_response_json) 172 | yield weibo_tweet_parser 173 | _inner_current_page += 1 174 | 175 | def weibo_comments_gen(): 176 | wtg = weibo_tweets_gen() 177 | for i in wtg: 178 | for j in i.cards_node: 179 | id = j.mblog.id 180 | mid = j.mblog.mid 181 | global comment_response 182 | try: 183 | comment_response = weibo_comments(id=id, mid=mid) 184 | tweet_comment_parser = WeiboCommentParser(comment_response) 185 | j.mblog.comment_parser = tweet_comment_parser 186 | except Exception as ex: 187 | logger.error( 188 | "#get_weibo_tweets_formatted.weibo_comments_gen request weibo comment occurred an exception, ex=%s,comment_response=%s" % ( 189 | ex, comment_response)) 190 | j.mblog.comment_parser = None 191 | pass 192 | yield i 193 | 194 | if with_comments: 195 | yield from weibo_comments_gen() 196 | else: 197 | yield from weibo_tweets_gen() 198 | 199 | 200 | def weibo_get_index_parser(name: str = None, uid: str = None) -> _WeiboGetIndexResponse: 201 | """ 202 | Get weibo get index parser 203 | :param name: name 204 | :param uid: uid 205 | :return: _WeiboGetIndexResponse 206 | """ 207 | if uid is not None: 208 | _uid = uid 209 | elif name is not None: 210 | _egu_response = exist_get_uid(name=name) 211 | if not _egu_response.get('exist'): 212 | return None 213 | _uid = _egu_response.get('uid') 214 | else: 215 | return None 216 | _weibo_get_index_response_parser = WeiboGetIndexParser(get_index_api_response=weibo_getIndex(uid_value=_uid)) 217 | if _weibo_get_index_response_parser.raw_response is None \ 218 | or _weibo_get_index_response_parser.raw_response.get('data') == 0: 219 | return None 220 | return _weibo_get_index_response_parser 221 | 222 | @ws_handle 223 | def get_weibo_profile(name: str = None, uid: str = None) -> _UserMetaResponse: 224 | """ 225 | Get weibo profile 226 | >>> from weibo_scraper import get_weibo_profile 227 | >>> weibo_profile = get_weibo_profile(name='嘻红豆',) 228 | :param uid: uid 229 | :param name: name 230 | :return: UserMeta 231 | """ 232 | weibo_get_index_parser_response = weibo_get_index_parser(name=name, uid=uid) 233 | return weibo_get_index_parser_response.user if weibo_get_index_parser_response is not None else None 234 | 235 | 236 | FOLLOWER_FLAG = 1 237 | 238 | FOLLOW_FLAG = 0 239 | 240 | 241 | def get_follows_and_followers(name: str = None, 242 | uid: str = None, 243 | pages: int = None, 244 | invoke_flag: int = FOLLOW_FLAG): 245 | """ 246 | Get follows and followers by name or uid limit by pages 247 | :param invoke_flag: 0-follow , 1-follower 
248 | :param name: 249 | :param uid: 250 | :param pages: 251 | :return: 252 | """ 253 | 254 | def gen_follows_and_followers(_inner_current_page=1, _total_items=0): 255 | while True: 256 | # stop max pages 257 | if pages is not None and _inner_current_page > pages: 258 | break 259 | if invoke_flag == FOLLOW_FLAG: 260 | _weibo_follows_and_followers_second_response = weibo_second( 261 | containerid=weibo_get_index_parser_response.follow_containerid_second, 262 | page=_inner_current_page) 263 | else: 264 | _weibo_follows_and_followers_second_response = weibo_second( 265 | containerid=weibo_get_index_parser_response.follower_containerid_second, 266 | page=_inner_current_page) 267 | # skip bad request 268 | if _weibo_follows_and_followers_second_response is None: 269 | continue 270 | # stop end page 271 | if _weibo_follows_and_followers_second_response.get('ok') == 0: 272 | break 273 | _follow_and_follower_parser = FollowAndFollowerParser( 274 | follow_and_follower_response=_weibo_follows_and_followers_second_response) 275 | yield _follow_and_follower_parser 276 | _inner_current_page += 1 277 | 278 | weibo_get_index_parser_response = weibo_get_index_parser(name=name, uid=uid) 279 | if weibo_get_index_parser_response is None: 280 | yield [] 281 | else: 282 | yield from gen_follows_and_followers() 283 | 284 | 285 | def get_follows(name: str = None, uid: str = None, pages: int = None, max_item_limit: int = None): 286 | """ 287 | 288 | :param max_item_limit: 289 | :param name: 290 | :param uid: 291 | :param pages: 292 | :return: 293 | """ 294 | current_total_pages = 0 295 | follows_iterator = get_follows_and_followers(name=name, uid=uid, pages=pages) 296 | for follow in follows_iterator: 297 | if follow is None: 298 | yield None 299 | else: 300 | for user in follow.user_list: 301 | if max_item_limit is not None and current_total_pages >= max_item_limit: 302 | return 303 | yield user 304 | current_total_pages += 1 305 | 306 | 307 | def get_followers(name: str = None, 308 | uid: str = None, 309 | pages: int = None, 310 | max_item_limit: int = None): 311 | """ 312 | Get weibo follower by name, 粉丝 313 | XIHONGDOU's fans 314 | https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_3637346297&page=0 315 | https://m.weibo.cn/api/container/getSecond?containerid=1005053637346297_-_FOLLOWERS&page=0 316 | 317 | :param max_item_limit: 318 | :param pages: 319 | :param uid: 320 | :param name: 321 | :return: 322 | 323 | """ 324 | current_total_pages = 0 325 | followers_iterator = get_follows_and_followers(name=name, uid=uid, pages=pages, invoke_flag=1) 326 | for follower in followers_iterator: 327 | if follower is None: 328 | yield None 329 | else: 330 | for user in follower.user_list: 331 | if max_item_limit is not None and current_total_pages >= max_item_limit: 332 | return 333 | yield user 334 | current_total_pages += 1 335 | 336 | 337 | @ws_handle 338 | def get_realtime_hotwords() -> List[RealTimeHotWordResponse]: 339 | """ 340 | get real time hot words 341 | """ 342 | hot_words = realtime_hotword() 343 | if None is hot_words: 344 | return [] 345 | 346 | index = 1 347 | response = [] 348 | for item in hot_words.get('data').get('cards')[0].get('card_group'): 349 | if item.get('promotion'): 350 | continue 351 | rthr = RealTimeHotWordResponse() 352 | rthr.sequence = index 353 | rthr.desc = item.get('desc') 354 | rthr.hot = 0 if item.get('desc_extr') is None else item.get('desc_extr') 355 | rthr.url = item.get('scheme') 356 | response.append(rthr) 357 | index += 1 358 | 359 | return response 
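# Illustrative usage sketch (added for clarity, not part of the original
# module): each entry returned by get_realtime_hotwords() is a
# RealTimeHotWordResponse exposing sequence, desc, hot and url.
# >>> from weibo_scraper import get_realtime_hotwords
# >>> for hot_word in get_realtime_hotwords():
# >>>     print(hot_word.sequence, hot_word.desc, hot_word.hot, hot_word.url)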
360 | # -------------------- simplify method name ---------------- 361 | -------------------------------------------------------------------------------- /weibo_scraper_cli.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Verion: 1.0 5 | Since : 3.6 6 | Author: zhangjian 7 | Site: https://github.com/Xarrow/weibo-scraper 8 | File: cli 9 | Time: 2018/12/18 10 | 11 | Add New Functional cli 12 | """ 13 | 14 | import argparse 15 | import os 16 | 17 | from prompt_toolkit import prompt 18 | from prompt_toolkit.completion import WordCompleter 19 | 20 | 21 | def cli(): 22 | """weibo-cli""" 23 | weibo_scraper_name = "weibo-scraper" 24 | weibo_scraper_version = "1.0.7 beta" 25 | weibo_scraper_description = weibo_scraper_name + "-" + weibo_scraper_version 26 | parser = argparse.ArgumentParser(description=weibo_scraper_description, 27 | prog=weibo_scraper_name, 28 | formatter_class=argparse.RawDescriptionHelpFormatter) 29 | 30 | parser.add_argument("-u", type=str, required=False, help="username [nickname] which want to exported") 31 | parser.add_argument("-p", type=int, required=False, default=None, help="pages which exported [ default 1 page ]") 32 | 33 | parser.add_argument("-o", type=str, required=False, default=os.getcwd(), 34 | help="output file path which expected [ default 'current dir' ]") 35 | parser.add_argument("-f", "--format", type=str, required=False, default="txt", 36 | help="format which expected [ default 'txt' ]") 37 | parser.add_argument("-efn", "--exported_file_name", required=False, default=None, help="file name which expected") 38 | parser.add_argument("-s", "--simplify", action="store_true", help="simplify available info") 39 | parser.add_argument("-d", "--debug", action="store_true", help="open debug mode") 40 | parser.add_argument("--more", action="store_true", help="more") 41 | parser.add_argument("-v", "--version", action="store_true", help="weibo scraper version") 42 | 43 | args = parser.parse_args() 44 | 45 | if args is None: 46 | print(args) 47 | 48 | if args.version: 49 | print(weibo_scraper_version) 50 | return 51 | 52 | if args.more: 53 | more_description = weibo_scraper_description 54 | more_description += " you can visit https://xarrow.github.io/weibo-scraper in detail" 55 | return 56 | 57 | if args.u is None: 58 | parser.print_help() 59 | return 60 | 61 | name = args.u 62 | pages = args.p 63 | is_simplify = args.simplify 64 | persistence_format = args.format 65 | export_file_path = args.o 66 | export_file_name = args.exported_file_name 67 | is_debug = args.debug 68 | 69 | persistence.dispatch(name=name, 70 | pages=pages, 71 | is_simplify=is_simplify, 72 | persistence_format=persistence_format, 73 | export_file_path=export_file_path, 74 | export_file_name=export_file_name, 75 | is_debug=is_debug) 76 | 77 | 78 | ws = ['', '', '', '', 'google', '-u'] 79 | 80 | 81 | class CompleterProxy(WordCompleter): 82 | def __init__(self, *args, **kwargs): 83 | super().__init__(*args, **kwargs) 84 | self.user_input = "" 85 | 86 | def get_completions( 87 | self, document, complete_event): 88 | self.user_input = document.text 89 | if self.user_input in ws: 90 | ws.remove(self.user_input) 91 | return super().get_completions(document=document, complete_event=complete_event) 92 | 93 | def bottom_toolbar(self): 94 | if self.user_input == '-u': 95 | return "Help: 微博名称" 96 | return "Help: " + self.user_input 97 | 98 | 99 | if __name__ == '__main__': 100 | # html_completer = CompleterProxy(ws) 101 | # text = 
prompt('weibo-scraper: ', completer=html_completer, bottom_toolbar=html_completer.bottom_toolbar) 102 | # print("weibo-scraper ", text) 103 | import persistence  # required by persistence.dispatch(...) inside cli() 104 | cli() 105 | --------------------------------------------------------------------------------
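Taken together, the utility classes in weibo_base/weibo_util.py and the top-level functions in weibo_scraper.py can be combined into a small polling job. The sketch below is illustrative only and relies solely on the APIs shown above; the 60-second interval, the timer name and the callback functions are assumptions made for the example.

from weibo_base.weibo_util import Timer, TimerManager
from weibo_scraper import get_realtime_hotwords


def print_hotwords(hot_words):
    # on_result callback: receives whatever the scheduled function returned
    for hw in hot_words:
        print(hw.sequence, hw.desc, hw.hot, hw.url)


# Poll the real-time hot-word list every 60 seconds (interval chosen for illustration).
hotword_timer = (
    Timer(name="hotword-poller", fn=get_realtime_hotwords, interval=60)
    .set_on_result(print_hotwords)
    .set_on_exception(lambda ex: print("hot-word fetch failed:", ex))
    .set_ignore_ex(True)  # keep rescheduling even if a single fetch raises
)

manager = TimerManager()
manager.add_timer(hotword_timer)
manager.execute()  # schedules every registered timer and keeps rescheduling it
# ... later: manager.stop() cancels all timers and clears the container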