├── .circleci └── config.yml ├── .codecov.yml ├── .coveragerc ├── .gitignore ├── .pep8speaks.yml ├── HISTORY.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── logparser ├── __init__.py ├── __version__.py ├── common.py ├── logparser.py ├── run.py ├── scrapylogparser.py ├── settings.py ├── telnet.py └── utils.py ├── requirements-tests.txt ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── conftest.py ├── demo_log.py ├── logs.zip ├── test_logparser.py ├── test_parse.py ├── test_settings.py ├── test_telnet.py ├── test_utils.py ├── test_z_cleantest.py └── utils.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Python CircleCI 2.1 configuration file 2 | version: 2.1 3 | orbs: 4 | codecov: codecov/codecov@1.0.2 5 | allure: ayte/allure@0.1.3 6 | jobs: 7 | py39: &test-template 8 | docker: 9 | - image: cimg/python:3.9 10 | working_directory: ~/repo 11 | parameters: 12 | is-py27: 13 | type: boolean 14 | default: false 15 | allure-version: 16 | description: Allure version to use 17 | type: string 18 | default: 2.13.1 19 | allure-configuration-path: 20 | description: Path to Allure configuration, uses default one if omitted 21 | type: string 22 | default: /usr/local/share/allure/config/allure.yml 23 | allure-target-path: 24 | description: Path for report directory 25 | type: string 26 | default: allure-report 27 | allure-results-path: 28 | description: Path to directory with test results 29 | type: string 30 | default: allure-results 31 | allure-artifact-path: 32 | description: Path that will be used when storing result as artifact 33 | type: string 34 | default: Report/Allure 35 | steps: 36 | - run: 37 | name: Install telnet 38 | command: | 39 | sudo apt-get update && sudo apt-get install telnet 40 | - run: 41 | name: Install Java 11 42 | command: | 43 | sudo apt-get update 44 | sudo apt-get install -y openjdk-11-jdk 45 | - run: 46 | name: Set JAVA_HOME 47 | command: | 48 | echo "export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64" >> $BASH_ENV 49 | source $BASH_ENV 50 | - checkout 51 | - when: 52 | condition: << parameters.is-py27 >> 53 | steps: 54 | - run: 55 | name: Create virtual env in PY2 56 | command: | 57 | virtualenv ./venv 58 | - unless: 59 | condition: << parameters.is-py27 >> 60 | steps: 61 | - run: 62 | name: Create virtual env in PY3 63 | command: | 64 | python3 -m venv venv 65 | - run: 66 | name: Install dependencies 67 | command: | 68 | # python3 -m venv venv 69 | # virtualenv ./venv 70 | . venv/bin/activate 71 | which python 72 | python --version 73 | pip install -r requirements.txt 74 | pip install -r requirements-tests.txt 75 | - run: 76 | name: Run tests 77 | command: | 78 | ls -la 79 | . venv/bin/activate 80 | flake8 . --count --exclude=./venv* --select=E9,F63,F7,F82 --show-source --statistics 81 | coverage erase 82 | coverage run --source=logparser -m pytest -s -vv -l --disable-warnings --alluredir=allure-results tests 83 | - run: 84 | name: Generate report 85 | command: | 86 | . 
venv/bin/activate 87 | coverage report 88 | coverage html 89 | coverage xml 90 | ls -la 91 | ls -la allure-results || echo 'ignore error' 92 | coveralls 93 | # https://discuss.circleci.com/t/make-custom-command-run-always-with-when-always/38957 94 | # https://circleci.com/docs/configuration-reference/#the-when-attribute 95 | when: always 96 | - store_artifacts: 97 | path: htmlcov 98 | - store_artifacts: 99 | path: coverage.xml 100 | - codecov/upload: 101 | file: coverage.xml 102 | # https://discuss.circleci.com/t/how-can-we-publish-pytest-results-to-circleci-using-allure-reports/37830/2 103 | # https://circleci.com/developer/orbs/orb/ayte/allure 104 | # - allure/install 105 | # - allure/report 106 | # https://circleci.com/docs/configuration-reference/#the-when-step 107 | - when: 108 | condition: 109 | equal: [ 1, 1 ] 110 | steps: 111 | - run: 112 | name: Allure archive download 113 | command: >- 114 | curl -L https://github.com/allure-framework/allure2/releases/download/<< 115 | parameters.allure-version >>/allure-commandline-<< parameters.allure-version >>.zip -o 116 | /tmp/allure.zip 117 | when: always 118 | - run: 119 | name: Archive extraction 120 | command: unzip /tmp/allure.zip 121 | when: always 122 | - run: 123 | name: Allure installation 124 | command: sudo mv allure-<< parameters.allure-version >> /usr/local/share/allure 125 | when: always 126 | - run: 127 | name: Allure binary symlinking 128 | command: sudo ln -s /usr/local/share/allure/bin/allure /usr/local/bin/allure 129 | when: always 130 | - when: 131 | condition: 132 | equal: [ 1, 1 ] 133 | steps: 134 | - run: 135 | name: >- 136 | Allure report generation (<< parameters.allure-results-path >> -> << 137 | parameters.allure-target-path >>) 138 | command: | 139 | allure generate \ 140 | --config << parameters.allure-configuration-path >> \ 141 | --report-dir << parameters.allure-target-path >> \ 142 | << parameters.allure-results-path >> 143 | when: always 144 | - store_artifacts: 145 | path: << parameters.allure-target-path >> 146 | destination: << parameters.allure-artifact-path >> 147 | py27: 148 | <<: *test-template 149 | docker: 150 | - image: cimg/python:2.7 151 | py38: 152 | <<: *test-template 153 | docker: 154 | - image: cimg/python:3.8 155 | py310: 156 | <<: *test-template 157 | docker: 158 | - image: cimg/python:3.10 159 | py311: 160 | <<: *test-template 161 | docker: 162 | - image: cimg/python:3.11 163 | py312: 164 | <<: *test-template 165 | docker: 166 | - image: cimg/python:3.12 167 | py313: 168 | <<: *test-template 169 | docker: 170 | - image: cimg/python:3.13 171 | workflows: 172 | test: 173 | jobs: 174 | # - py27: 175 | # is-py27: true 176 | - py38 177 | - py39 178 | - py310 179 | - py311 180 | - py312 181 | - py313 182 | -------------------------------------------------------------------------------- /.codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | notify: 3 | require_ci_to_pass: yes 4 | 5 | coverage: 6 | precision: 2 7 | round: down 8 | range: "70...100" 9 | 10 | status: 11 | project: yes 12 | patch: yes 13 | changes: no 14 | 15 | parsers: 16 | gcov: 17 | branch_detection: 18 | conditional: yes 19 | loop: yes 20 | method: no 21 | macro: no 22 | 23 | comment: 24 | layout: "header, reach, diff, flags, files" 25 | behavior: default 26 | require_changes: no 27 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | 
include = logparser/* 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | tests/logs/ 2 | tests/demo_project/ 3 | 4 | venv/ 5 | 6 | *.pyc 7 | __pycache__/ 8 | 9 | instance/ 10 | 11 | .pytest_cache/ 12 | .coverage 13 | coverage.xml 14 | htmlcov/ 15 | 16 | dist/ 17 | build/ 18 | *.egg-info/ 19 | 20 | .idea 21 | -------------------------------------------------------------------------------- /.pep8speaks.yml: -------------------------------------------------------------------------------- 1 | scanner: 2 | diff_only: True 3 | linter: flake8 4 | 5 | flake8: 6 | max-line-length: 120 # Default is 79 in PEP 8 7 | ignore: # Errors and warnings to ignore 8 | - E266 9 | - E303 10 | - E128 11 | - E701 12 | - W504 13 | -------------------------------------------------------------------------------- /HISTORY.md: -------------------------------------------------------------------------------- 1 | Release History 2 | =============== 3 | [0.8.4](https://github.com/my8100/logparser/issues?q=is%3Aclosed+milestone%3A0.8.4) (2025-01-05) 4 | ------------------ 5 | - Others 6 | - Support allure report on CircleCI [(PR #31)](https://github.com/my8100/logparser/issues/31) 7 | 8 | [0.8.3](https://github.com/my8100/logparser/issues?q=is%3Aclosed+milestone%3A0.8.3) (2025-01-01) 9 | ------------------ 10 | - New Features 11 | - Support telnet for Python 3.13 [(PR #29)](https://github.com/my8100/logparser/issues/29) 12 | - Bug Fixes 13 | - Fix log parsing issues and update tests [(issue #26)](https://github.com/my8100/logparser/issues/26) 14 | - Others 15 | - Fix CircleCI config on Python image [(PR #27)](https://github.com/my8100/logparser/issues/27) 16 | 17 | 18 | [0.8.2](https://github.com/my8100/logparser/issues?q=is%3Aclosed+milestone%3A0.8.2) (2019-08-04) 19 | ------------------ 20 | - New Features 21 | - Support telneting with auth for Scrapy>=1.5.2, except for Windows and Fedora 22 | - Improvements 23 | - Add LOG_CATEGORIES_LIMIT option for reducing json file size [(issue #5)](https://github.com/my8100/logparser/issues/5) 24 | - Bug Fixes 25 | - Fix parsing error due to unicode signs in crawler stats [(issue #2)](https://github.com/my8100/logparser/issues/2) 26 | - Stats collected via telnet are not being updated periodically [(issue #4)](https://github.com/my8100/logparser/issues/4) 27 | - Others 28 | - Support continuous integration (CI) on [CircleCI](https://circleci.com/) 29 | 30 | 31 | 0.8.1 (2019-03-12) 32 | ------------------ 33 | - New Features 34 | - Support collecting crawler_stats and crawler_engine via telnet if available 35 | (Note that this feature temporarily only works for [Scrapy 1.5.1](https://doc.scrapy.org/en/latest/news.html#scrapy-1-5-1-2018-07-12) and its earlier version 36 | since telnet console now requires username and password after [Scrapy 1.5.2](https://doc.scrapy.org/en/latest/news.html#release-1-5-2)) 37 | - Improvements 38 | - Set 'pages' and 'items' of the parsing result to None if not available, instead of 0 39 | - Change key of the parsing result from 'elapsed' to 'runtime' 40 | 41 | 42 | 0.8.0 (2019-01-20) 43 | ------------------ 44 | - First release version (compatible with [*ScrapydWeb* v1.1.0](https://github.com/my8100/scrapydweb)) 45 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | 
Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 
62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. 
A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 
186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 
246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. 
A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 
360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 
421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 
476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. 
You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 
583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | 635 | Copyright (C) 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see . 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | Copyright (C) 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | . 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | . -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include HISTORY.md 2 | global-exclude *.pyc 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LogParser: A tool for parsing Scrapy log files periodically and incrementally, designed for [*ScrapydWeb*](https://github.com/my8100/scrapydweb). 2 | 3 | [![PyPI - logparser Version](https://img.shields.io/pypi/v/logparser.svg)](https://pypi.org/project/logparser/) 4 | [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/logparser.svg)](https://pypi.org/project/logparser/) 5 | [![CircleCI](https://circleci.com/gh/my8100/logparser/tree/master.svg?style=shield)](https://circleci.com/gh/my8100/logparser/tree/master) 6 | [![codecov](https://codecov.io/gh/my8100/logparser/branch/master/graph/badge.svg)](https://codecov.io/gh/my8100/logparser) 7 | [![Coverage Status](https://coveralls.io/repos/github/my8100/logparser/badge.svg?branch=master)](https://coveralls.io/github/my8100/logparser?branch=master) 8 | [![Downloads - total](https://pepy.tech/badge/logparser)](https://pepy.tech/project/logparser) 9 | [![GitHub license](https://img.shields.io/github/license/my8100/logparser.svg)](https://github.com/my8100/logparser/blob/master/LICENSE) 10 | 11 | 12 | ## Installation 13 | - Use pip: 14 | ```bash 15 | pip install logparser 16 | ``` 17 | :heavy_exclamation_mark: Note that you may need to execute `python -m pip install --upgrade pip` first in order to get the latest version of logparser, or download the tar.gz file from https://pypi.org/project/logparser/#files and get it installed via `pip install logparser-x.x.x.tar.gz` 18 | 19 | - Use git: 20 | ```bash 21 | pip install --upgrade git+https://github.com/my8100/logparser.git 22 | ``` 23 | Or: 24 | ```bash 25 | git clone https://github.com/my8100/logparser.git 26 | cd logparser 27 | python setup.py install 28 | ``` 29 | 30 | ## Usage 31 | ### To use in Python 32 |
33 | View codes 34 | 35 | ```python 36 | In [1]: from logparser import parse 37 | 38 | In [2]: log = """2018-10-23 18:28:34 [scrapy.utils.log] INFO: Scrapy 1.5.0 started (bot: demo) 39 | ...: 2018-10-23 18:29:41 [scrapy.statscollectors] INFO: Dumping Scrapy stats: 40 | ...: {'downloader/exception_count': 3, 41 | ...: 'downloader/exception_type_count/twisted.internet.error.TCPTimedOutError': 3, 42 | ...: 'downloader/request_bytes': 1336, 43 | ...: 'downloader/request_count': 7, 44 | ...: 'downloader/request_method_count/GET': 7, 45 | ...: 'downloader/response_bytes': 1669, 46 | ...: 'downloader/response_count': 4, 47 | ...: 'downloader/response_status_count/200': 2, 48 | ...: 'downloader/response_status_count/302': 1, 49 | ...: 'downloader/response_status_count/404': 1, 50 | ...: 'dupefilter/filtered': 1, 51 | ...: 'finish_reason': 'finished', 52 | ...: 'finish_time': datetime.datetime(2018, 10, 23, 10, 29, 41, 174719), 53 | ...: 'httperror/response_ignored_count': 1, 54 | ...: 'httperror/response_ignored_status_count/404': 1, 55 | ...: 'item_scraped_count': 2, 56 | ...: 'log_count/CRITICAL': 5, 57 | ...: 'log_count/DEBUG': 14, 58 | ...: 'log_count/ERROR': 5, 59 | ...: 'log_count/INFO': 75, 60 | ...: 'log_count/WARNING': 3, 61 | ...: 'offsite/domains': 1, 62 | ...: 'offsite/filtered': 1, 63 | ...: 'request_depth_max': 1, 64 | ...: 'response_received_count': 3, 65 | ...: 'retry/count': 2, 66 | ...: 'retry/max_reached': 1, 67 | ...: 'retry/reason_count/twisted.internet.error.TCPTimedOutError': 2, 68 | ...: 'scheduler/dequeued': 7, 69 | ...: 'scheduler/dequeued/memory': 7, 70 | ...: 'scheduler/enqueued': 7, 71 | ...: 'scheduler/enqueued/memory': 7, 72 | ...: 'start_time': datetime.datetime(2018, 10, 23, 10, 28, 35, 70938)} 73 | ...: 2018-10-23 18:29:42 [scrapy.core.engine] INFO: Spider closed (finished)""" 74 | 75 | In [3]: odict = parse(log, headlines=1, taillines=1) 76 | 77 | In [4]: odict 78 | Out[4]: 79 | OrderedDict([('head', 80 | '2018-10-23 18:28:34 [scrapy.utils.log] INFO: Scrapy 1.5.0 started (bot: demo)'), 81 | ('tail', 82 | '2018-10-23 18:29:42 [scrapy.core.engine] INFO: Spider closed (finished)'), 83 | ('first_log_time', '2018-10-23 18:28:34'), 84 | ('latest_log_time', '2018-10-23 18:29:42'), 85 | ('runtime', '0:01:08'), 86 | ('first_log_timestamp', 1540290514), 87 | ('latest_log_timestamp', 1540290582), 88 | ('datas', []), 89 | ('pages', 3), 90 | ('items', 2), 91 | ('latest_matches', 92 | {'telnet_console': '', 93 | 'resuming_crawl': '', 94 | 'latest_offsite': '', 95 | 'latest_duplicate': '', 96 | 'latest_crawl': '', 97 | 'latest_scrape': '', 98 | 'latest_item': '', 99 | 'latest_stat': ''}), 100 | ('latest_crawl_timestamp', 0), 101 | ('latest_scrape_timestamp', 0), 102 | ('log_categories', 103 | {'critical_logs': {'count': 5, 'details': []}, 104 | 'error_logs': {'count': 5, 'details': []}, 105 | 'warning_logs': {'count': 3, 'details': []}, 106 | 'redirect_logs': {'count': 1, 'details': []}, 107 | 'retry_logs': {'count': 2, 'details': []}, 108 | 'ignore_logs': {'count': 1, 'details': []}}), 109 | ('shutdown_reason', 'N/A'), 110 | ('finish_reason', 'finished'), 111 | ('crawler_stats', 112 | OrderedDict([('source', 'log'), 113 | ('last_update_time', '2018-10-23 18:29:41'), 114 | ('last_update_timestamp', 1540290581), 115 | ('downloader/exception_count', 3), 116 | ('downloader/exception_type_count/twisted.internet.error.TCPTimedOutError', 117 | 3), 118 | ('downloader/request_bytes', 1336), 119 | ('downloader/request_count', 7), 120 | ('downloader/request_method_count/GET', 7), 121 | 
('downloader/response_bytes', 1669), 122 | ('downloader/response_count', 4), 123 | ('downloader/response_status_count/200', 2), 124 | ('downloader/response_status_count/302', 1), 125 | ('downloader/response_status_count/404', 1), 126 | ('dupefilter/filtered', 1), 127 | ('finish_reason', 'finished'), 128 | ('finish_time', 129 | 'datetime.datetime(2018, 10, 23, 10, 29, 41, 174719)'), 130 | ('httperror/response_ignored_count', 1), 131 | ('httperror/response_ignored_status_count/404', 1), 132 | ('item_scraped_count', 2), 133 | ('log_count/CRITICAL', 5), 134 | ('log_count/DEBUG', 14), 135 | ('log_count/ERROR', 5), 136 | ('log_count/INFO', 75), 137 | ('log_count/WARNING', 3), 138 | ('offsite/domains', 1), 139 | ('offsite/filtered', 1), 140 | ('request_depth_max', 1), 141 | ('response_received_count', 3), 142 | ('retry/count', 2), 143 | ('retry/max_reached', 1), 144 | ('retry/reason_count/twisted.internet.error.TCPTimedOutError', 145 | 2), 146 | ('scheduler/dequeued', 7), 147 | ('scheduler/dequeued/memory', 7), 148 | ('scheduler/enqueued', 7), 149 | ('scheduler/enqueued/memory', 7), 150 | ('start_time', 151 | 'datetime.datetime(2018, 10, 23, 10, 28, 35, 70938)')])), 152 | ('last_update_time', '2019-03-08 16:53:50'), 153 | ('last_update_timestamp', 1552035230), 154 | ('logparser_version', '0.8.1')]) 155 | 156 | In [5]: odict['runtime'] 157 | Out[5]: '0:01:08' 158 | 159 | In [6]: odict['pages'] 160 | Out[6]: 3 161 | 162 | In [7]: odict['items'] 163 | Out[7]: 2 164 | 165 | In [8]: odict['finish_reason'] 166 | Out[8]: 'finished' 167 | ``` 168 | 169 |
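The `parse` function accepts plain log text, so it can just as easily be applied to a log file read from disk. Below is a minimal sketch for Python 3; the file path is hypothetical, so point it at one of your own Scrapy log files:

```python
from logparser import parse

# Hypothetical path to a Scrapy log file (e.g. one generated via Scrapyd);
# replace it with the path to an actual log file on your machine.
log_path = 'path/to/scrapy_job.log'

with open(log_path, 'r', encoding='utf-8', errors='replace') as f:
    log = f.read()

odict = parse(log, headlines=5, taillines=5)

# 'pages' and 'items' are set to None if the log contains no such information.
print(odict['runtime'], odict['pages'], odict['items'], odict['finish_reason'])
print(odict['log_categories']['error_logs']['count'])
```

When run as a service (see the next section), LogParser performs this kind of parsing periodically and incrementally, and exposes the aggregated results as JSON.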
170 | 171 | ### To run as a service 172 | 1. **Make sure that [*Scrapyd*](https://github.com/scrapy/scrapyd) has been installed and started on the current host.** 173 | 2. Start ***LogParser*** via command `logparser` 174 | 3. Visit http://127.0.0.1:6800/logs/stats.json **(Assuming the Scrapyd service runs on port 6800.)** 175 | 4. Visit http://127.0.0.1:6800/logs/projectname/spidername/jobid.json to get stats of a job in details. 176 | 177 | ### To work with *ScrapydWeb* for visualization 178 | Check out https://github.com/my8100/scrapydweb for more info. 179 | 180 | ![stats](https://raw.githubusercontent.com/my8100/files/master/scrapydweb/screenshots/stats.gif) 181 | -------------------------------------------------------------------------------- /logparser/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import logging 3 | 4 | from .__version__ import __version__ 5 | from .common import SETTINGS_PY_PATH 6 | # from .logparser import LogParser 7 | from .scrapylogparser import parse 8 | 9 | 10 | # https://docs.python.org/3/howto/logging.html#configuring-logging-for-a-library 11 | # https://docs.python-guide.org/writing/logging/#logging-in-a-library 12 | logging.getLogger(__name__).addHandler(logging.NullHandler()) 13 | -------------------------------------------------------------------------------- /logparser/__version__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | __title__ = 'logparser' 4 | __version__ = '0.8.4' 5 | __author__ = 'my8100' 6 | __author_email__ = 'my8100@gmail.com' 7 | __url__ = 'https://github.com/my8100/logparser' 8 | __license__ = 'GNU General Public License v3.0' 9 | __description__ = "A tool for parsing Scrapy log files periodically and incrementally, designed for ScrapydWeb." 10 | -------------------------------------------------------------------------------- /logparser/common.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from collections import OrderedDict 3 | import json 4 | from datetime import datetime 5 | import os 6 | import platform 7 | import sys 8 | import re 9 | import time 10 | import traceback 11 | 12 | 13 | CWD = os.path.dirname(os.path.abspath(__file__)) 14 | SETTINGS_PY_PATH = os.path.join(CWD, 'settings.py') 15 | 16 | # LINESEP_PATTERN = re.compile(r'%s' % os.linesep) 17 | LINESEP_PATTERN = re.compile(r'\r\n|\n|\r') 18 | LINESEP_BULK_PATTERN = re.compile(r'(?:\r\n|\n|\r)\s*') # \s includes \t\r\n\f\v 19 | 20 | # 2019-01-01 00:00:01 21 | DATETIME_PATTERN = r'\d{4}-\d{2}-\d{2}[ ]\d{2}:\d{2}:\d{2}' # would be ignore with re.VERBOSE, use [ ] instead 22 | 23 | # 2019-01-01 00:00:01 [scrapy.extensions.logstats] INFO: 24 | # Crawled 2318 pages (at 2 pages/min), scraped 68438 items (at 60 items/min) 25 | DATAS_PATTERN = re.compile(r"""\n 26 | (?P%s)[ ].+? 
27 | Crawled[ ](?P\d+)[ ]pages[ ]\(at[ ](?P\d+)[ ]pages/min\) 28 | ,[ ]scraped[ ](?P\d+)[ ]items[ ]\(at[ ](?P\d+)[ ]items/min\) 29 | """ % DATETIME_PATTERN, re.VERBOSE) 30 | 31 | LOG_CATEGORIES_PATTERN_DICT = dict( 32 | critical_logs=r'\][ ]CRITICAL:', # [test] CRITICAL: 33 | error_logs=r'\][ ]ERROR:', # [test] ERROR: 34 | warning_logs=r'\][ ]WARNING:', # [test] WARNING: 35 | redirect_logs=r':[ ]Redirecting[ ]\(', # DEBUG: Redirecting (302) to 63 | latest_crawl=r'Crawled[ ]\(\d+\)', # Crawled (200) (referer: None) 64 | # latest_scrape=r'Scraped[ ]from[ ]<', # Scraped from <200 http://httpbin.org/headers> 65 | # latest_item=r'^\{.+\}', # {'item': 1} TODO: multilines item 66 | latest_stat=r'Crawled[ ]\d+[ ]pages[ ]\(at' # Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 67 | ) 68 | _odict = OrderedDict() 69 | for k in ['scrapy_version', 'telnet_console', 'telnet_username', 'telnet_password', 'resuming_crawl', 70 | 'latest_offsite', 'latest_duplicate', 'latest_crawl', 'latest_stat']: 71 | _odict.update({k: LATEST_MATCHES_PATTERN_DICT[k]}) 72 | LATEST_MATCHES_PATTERN_DICT = _odict 73 | for k, v in LATEST_MATCHES_PATTERN_DICT.items(): 74 | if k not in ['telnet_username', 'telnet_password']: 75 | LATEST_MATCHES_PATTERN_DICT[k] = r'^%s[ ].+?%s' % (DATETIME_PATTERN, v) 76 | 77 | # 2019-01-01 00:00:01 [scrapy.core.scraper] DEBUG: Scraped from <200 http://httpbin.org/headers> 78 | LATEST_SCRAPE_ITEM_PATTERN = re.compile(r"""\n 79 | ({time_}[ ][^\n]+?{pattern}[^\n]+?)\r?\n({{.*?) 80 | (?=\r?\n{time_}[ ][^\n]+?(?:DEBUG|INFO|WARNING|ERROR|CRITICAL)) # ?=: 81 | """.format(time_=DATETIME_PATTERN, pattern=r':[ ]Scraped[ ]from[ ]<'), 82 | re.VERBOSE | re.DOTALL) 83 | 84 | # 2019-01-01 00:00:01 [scrapy.crawler] INFO: Received SIGTERM, shutting down gracefully. 
Send again to force 85 | # 2019-01-01 00:00:01 [scrapy.core.engine] INFO: Closing spider (shutdown) 86 | # 2019-01-01 00:00:01 [scrapy.crawler] INFO: Received SIGTERM twice, forcing unclean shutdown 87 | SIGTERM_PATTERN = re.compile(r'^%s[ ].+?:[ ](Received[ ]SIG(?:BREAK|INT|TERM)([ ]twice)?),' % DATETIME_PATTERN) 88 | 89 | # 'downloader/response_status_count/200': 2, 90 | # 200 301 302 401 403 404 500 503 91 | RESPONSE_STATUS_PATTERN = re.compile(r"'downloader/response_status_count/\d{3}':[ ](?P\d+),") 92 | RESPONSE_STATUS_REDIRECT_PATTERN = re.compile(r"'downloader/response_status_count/3\d{2}':[ ](?P\d+),") 93 | 94 | STATS_DUMPED_CATEGORIES_DICT = dict( 95 | critical_logs='log_count/CRITICAL', 96 | error_logs='log_count/ERROR', 97 | warning_logs='log_count/WARNING', 98 | # redirect_logs= , 99 | retry_logs='retry/count', 100 | ignore_logs='httperror/response_ignored_count', 101 | ) 102 | 103 | # https://github.com/stummjr/scrapy-fieldstats -> fields_coverage in stats 104 | # 2019-01-01 00:00:01 [scrapy_fieldstats.fieldstats] INFO: Field stats: 105 | # {u'Chinese \u6c49\u5b57 1': '50%', u'Chinese \u6c49\u5b57 2': '50%'} 106 | # 2019-01-01 00:00:01 [scrapy_fieldstats.fieldstats] INFO: Field stats: 107 | # { 108 | # 'author': { 109 | # 'name': '100.0%', 110 | # 'age': '52.0%' 111 | # }, 112 | # 'image': '97.0%', 113 | # 'title': '100.0%', 114 | # 'price': '92.0%', 115 | # 'stars': '47.5%' 116 | # } 117 | # 2019-01-01 00:00:01 [scrapy.core.engine] INFO: Closing spider (finished) 118 | # 2019-01-01 00:00:01 [scrapy.statscollectors] INFO: Dumping Scrapy stats: 119 | # {'downloader/exception_count': 3, 120 | # 'dupefilter/filtered': 1, 121 | # 'fields_coverage': {u'Chinese \u6c49\u5b57 1': '50%', 122 | # u'Chinese \u6c49\u5b57 2': '50%'}, 123 | # 'finish_reason': 'finished', 124 | # } 125 | # 2019-01-01 00:00:01 [scrapy.core.engine] INFO: Spider closed (finished) 126 | PATTERN_LOG_ENDING = re.compile(r""" 127 | (%s)[ ][^\n]+? 128 | (Dumping[ ]Scrapy[ ]stats:.*?(\{.+\}).*? 
129 | |INFO:[ ]Spider[ ]closed.*) 130 | """ % DATETIME_PATTERN, re.VERBOSE | re.DOTALL) 131 | 132 | 133 | class Common(object): 134 | NA = 'N/A' 135 | 136 | LINESEP_PATTERN = LINESEP_PATTERN 137 | LINESEP_BULK_PATTERN = LINESEP_BULK_PATTERN 138 | 139 | DATETIME_PATTERN = DATETIME_PATTERN 140 | DATAS_PATTERN = DATAS_PATTERN 141 | LOG_CATEGORIES_PATTERN_DICT = LOG_CATEGORIES_PATTERN_DICT 142 | LATEST_MATCHES_PATTERN_DICT = LATEST_MATCHES_PATTERN_DICT 143 | LATEST_SCRAPE_ITEM_PATTERN = LATEST_SCRAPE_ITEM_PATTERN 144 | 145 | SIGTERM_PATTERN = SIGTERM_PATTERN 146 | RESPONSE_STATUS_PATTERN = RESPONSE_STATUS_PATTERN 147 | RESPONSE_STATUS_REDIRECT_PATTERN = RESPONSE_STATUS_REDIRECT_PATTERN 148 | STATS_DUMPED_CATEGORIES_DICT = STATS_DUMPED_CATEGORIES_DICT 149 | PATTERN_LOG_ENDING = PATTERN_LOG_ENDING 150 | 151 | CWD = CWD 152 | ON_WINDOWS = platform.system() == 'Windows' 153 | PY2 = sys.version_info.major < 3 154 | SETTINGS_PY_PATH = SETTINGS_PY_PATH 155 | 156 | @staticmethod 157 | def get_current_time_timestamp(): 158 | current_timestamp = int(time.time()) 159 | current_time = datetime.fromtimestamp(current_timestamp).strftime('%Y-%m-%d %H:%M:%S') 160 | return current_time, current_timestamp 161 | 162 | @staticmethod 163 | def parse_log_path(log_path): 164 | project, spider, _job = log_path.split(os.sep)[-3:] 165 | job, ext = os.path.splitext(_job) # ('job', '') or ('job', '.log') 166 | return project, spider, job, ext 167 | 168 | def get_ordered_dict(self, adict, source): 169 | odict = OrderedDict(source=source) 170 | odict['last_update_time'], odict['last_update_timestamp'] = self.get_current_time_timestamp() 171 | for key in sorted(adict.keys()): 172 | odict[key] = adict[key] 173 | return odict 174 | 175 | @staticmethod 176 | def parse_crawler_stats(text): 177 | # 'start_time': datetime.datetime(2019, 3, 9, 13, 55, 24, 601697) 178 | # "robotstxt/exception_count/": 1, 179 | # scrapy-crawlera/scrapy_crawlera/middleware.py: 180 | # self.crawler.stats.inc_value( 181 | # 'crawlera/response/error/%s' % crawlera_error.decode('utf8')) 182 | # u"crawlera/response/error/timeout": 1 183 | # 'items_per_minute': None, 184 | # 'responses_per_minute': None, 185 | backup = text 186 | text = re.sub(r'(datetime.datetime\(.+?\))', r'"\1"', text) 187 | text = re.sub(r'(".*?)\'(.*?)\'(.*?")', r'\1_\2_\3', text) 188 | text = re.sub(r"'(.+?)'", r'"\1"', text) 189 | text = re.sub(r'[bu]"(.+?)"', r'"\1"', text) 190 | text = re.sub(r': None([,}])', r': null\1', text) 191 | try: 192 | return json.loads(text) 193 | except ValueError as err: 194 | print(text) 195 | print(traceback.format_exc()) 196 | # str(err) to avoid TypeError: Object of type JSONDecodeError is not JSON serializable 197 | return dict(json_loads_error=str(err), stats=backup) 198 | 199 | def update_data_with_crawler_stats(self, data, crawler_stats, update_log_count): 200 | # 'downloader/response_count': 4, 201 | # 'downloader/response_status_count/200': 2, 202 | # 'downloader/response_status_count/302': 1, 203 | # 'downloader/response_status_count/404': 1, 204 | # 'finish_reason': 'closespider_timeout', 205 | # 'item_scraped_count': 2, 206 | # 'response_received_count': 3, 207 | data['finish_reason'] = crawler_stats.get('finish_reason', data['finish_reason']) 208 | data['pages'] = crawler_stats.get('response_received_count', data['pages']) 209 | data['items'] = crawler_stats.get('item_scraped_count', data['items']) 210 | 211 | if not update_log_count: 212 | return 213 | redirect_count = 0 214 | for key, value in crawler_stats.items(): 215 | if 
key.startswith('downloader/response_status_count/3'): 216 | redirect_count += value 217 | if redirect_count > 0: 218 | data['log_categories']['redirect_logs']['count'] = redirect_count 219 | 220 | for level, key in self.STATS_DUMPED_CATEGORIES_DICT.items(): 221 | count = crawler_stats.get(key, 0) 222 | if count > 0: 223 | data['log_categories'][level]['count'] = count 224 | -------------------------------------------------------------------------------- /logparser/logparser.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from collections import OrderedDict 3 | from datetime import datetime 4 | import glob 5 | import io 6 | import json 7 | import logging 8 | import os 9 | import re 10 | import sys 11 | import time 12 | import traceback 13 | 14 | try: 15 | from psutil import pid_exists 16 | except ImportError: 17 | pid_exists = None 18 | 19 | from .__version__ import __version__ 20 | from .common import Common 21 | from .scrapylogparser import parse 22 | from .telnet import MyTelnet 23 | from .utils import custom_settings, get_logger 24 | 25 | 26 | logger = get_logger(__name__) 27 | 28 | SIMPLIFIED_KEYS = [ 29 | 'log_path', 30 | 'json_path', 31 | 'json_url', 32 | 'size', 33 | 'position', 34 | 'status', 35 | '_head', 36 | 37 | 'pages', 38 | 'items', 39 | 'first_log_time', 40 | 'latest_log_time', 41 | 'runtime', 42 | 'shutdown_reason', 43 | 'finish_reason', 44 | 'last_update_time' 45 | ] 46 | 47 | 48 | # noinspection PyBroadException 49 | class LogParser(Common): 50 | # datas = {} # Cause shared self.datas between test functions! 51 | logger = logger 52 | 53 | def __init__(self, scrapyd_server, scrapyd_logs_dir, parse_round_interval, 54 | enable_telnet, override_telnet_console_host, log_encoding, log_extensions, 55 | log_head_lines, log_tail_lines, log_categories_limit, jobs_to_keep, chunk_size, 56 | delete_existing_json_files_at_startup, keep_data_in_memory, verbose, 57 | main_pid=0, debug=False, exit_timeout=0): 58 | self.SCRAPYD_SERVER = scrapyd_server 59 | self.SCRAPYD_LOGS_DIR = scrapyd_logs_dir 60 | self.PARSE_ROUND_INTERVAL = parse_round_interval 61 | self.ENABLE_TELNET = enable_telnet 62 | 63 | self.OVERRIDE_TELNET_CONSOLE_HOST = override_telnet_console_host 64 | self.LOG_ENCODING = log_encoding 65 | self.LOG_EXTENSIONS = log_extensions 66 | self.LOG_HEAD_LINES = log_head_lines 67 | self.LOG_TAIL_LINES = log_tail_lines 68 | self.LOG_CATEGORIES_LIMIT = log_categories_limit 69 | self.JOBS_TO_KEEP = jobs_to_keep 70 | self.CHUNK_SIZE = chunk_size 71 | self.DELETE_EXISTING_JSON_FILES_AT_STARTUP = delete_existing_json_files_at_startup 72 | self.KEEP_DATA_IN_MEMORY = keep_data_in_memory 73 | 74 | self.verbose = verbose 75 | if self.verbose: 76 | self.logger.setLevel(logging.DEBUG) 77 | else: 78 | self.logger.setLevel(logging.INFO) 79 | self.DEBUG = debug 80 | self.EXIT_TIMEOUT = exit_timeout 81 | 82 | self.main_pid = main_pid 83 | self.logparser_pid = os.getpid() 84 | 85 | # TypeError: Object of type set is not JSON serializable 86 | self.logger.debug(self.json_dumps(vars(self))) 87 | 88 | self.stats_json_path = os.path.join(self.SCRAPYD_LOGS_DIR, 'stats.json') 89 | self.stats_json_url = 'http://%s/logs/stats.json' % self.SCRAPYD_SERVER 90 | self.logparser_version = __version__ 91 | self.init_time = time.time() 92 | self.log_paths = [] 93 | self.existing_file_keys = set() 94 | self.datas = {} 95 | 96 | if self.DELETE_EXISTING_JSON_FILES_AT_STARTUP: 97 | self.delete_existing_results() 98 | 99 | if not 
os.path.exists(self.stats_json_path): 100 | self.save_text_into_logs_dir('stats.json', self.json_dumps(self.get_default_stats())) 101 | 102 | def calc_runtime(self, start_string, end_string): 103 | try: 104 | start_datetime = datetime.strptime(start_string, '%Y-%m-%d %H:%M:%S') 105 | end_datetime = datetime.strptime(end_string, '%Y-%m-%d %H:%M:%S') 106 | except (TypeError, ValueError): # 0 or '' 107 | return self.NA 108 | else: 109 | return str(end_datetime - start_datetime) 110 | 111 | # REF: /scrapydweb/scrapydweb/utils/poll.py 112 | def check_exit(self): 113 | exit_condition_1 = pid_exists is not None and not pid_exists(self.main_pid) 114 | exit_condition_2 = not self.ON_WINDOWS and not self.check_pid(self.main_pid) 115 | if exit_condition_1 or exit_condition_2: 116 | sys.exit("!!! LogParser subprocess (pid: %s) exits " 117 | "since main_pid %s not exists" % (self.logparser_pid, self.main_pid)) 118 | 119 | @staticmethod 120 | def check_pid(pid): 121 | """ Check For the existence of a unix pid. """ 122 | try: 123 | os.kill(pid, 0) 124 | except OSError: 125 | return False 126 | else: 127 | return True 128 | 129 | def count_actual_lines(self, text): 130 | return len(re.split(self.LINESEP_BULK_PATTERN, text)) 131 | 132 | def cut_text(self, text, lines_limit, keep_head=True): 133 | count = 0 134 | lines = [] 135 | for line in re.split(self.LINESEP_PATTERN, text)[::1 if keep_head else -1]: 136 | lines.append(line) 137 | if line.strip(): 138 | count += 1 139 | if count == lines_limit: 140 | break 141 | return '\n'.join(lines[::1 if keep_head else -1]) 142 | 143 | def delete_existing_results(self): 144 | for path in glob.glob(os.path.join(self.SCRAPYD_LOGS_DIR, '*/*/*.json')): 145 | try: 146 | os.remove(path) 147 | except Exception as err: 148 | self.logger.error(err) 149 | else: 150 | self.logger.warning("Deleted %s", path) 151 | 152 | def find_text_to_ignore(self, text): 153 | lines = re.split(r'\n', text) # KEEP the same '\n' 154 | m = re.search(self.PATTERN_LOG_ENDING, text) 155 | if m: 156 | self.logger.info("Found log ending:\n%s", self.format_log_block('log ending', m.group())) 157 | text_to_ignore = '' 158 | else: 159 | # To ensure the integrity of a log with multilines, e.g. 
error with traceback info, 160 | # the tail of the appended_log must be ignored 161 | # 2019-01-01 00:00:01 [test] WARNING: warning # Would be parsed in this round 162 | # 123abc # Would be parsed in this round 163 | # ------------------------------------------------------------------------- 164 | # 2019-01-01 00:00:01 [test] ERROR: error # Would be ignored for next round 165 | # 456abc # Would be ignored for next round 166 | if len(re.findall(self.DATETIME_PATTERN + r'[ ].+?\n', text)) < 2: 167 | text_to_ignore = text 168 | self.logger.debug("Skip short appended log for next round: %s", repr(text_to_ignore)) 169 | else: 170 | lines_to_ignore = [] 171 | for line in lines[::-1]: 172 | lines_to_ignore.append(line) 173 | if re.match(self.DATETIME_PATTERN, line): 174 | break 175 | text_to_ignore = '\n'.join(lines_to_ignore[::-1]) # KEEP the same '\n' 176 | self.logger.debug("Text to be ignored for next round: %s", repr(text_to_ignore)) 177 | 178 | return text_to_ignore 179 | 180 | @staticmethod 181 | def format_log_block(title, log, lines_limit=0): 182 | if lines_limit: 183 | lines = re.split(r'\n', log) 184 | half = max(1, int(lines_limit / 2)) 185 | if len(lines) > lines_limit: 186 | log = '\n'.join(lines[:half] + ['......'] + lines[-half:]) 187 | return u'\n{title}:\n{sign}\n{log}\n{sign}\n'.format(title=title, log=log, sign='=' * 150) 188 | 189 | def get_default_stats(self): 190 | last_update_timestamp = int(time.time()) 191 | last_update_time = datetime.fromtimestamp(last_update_timestamp).strftime('%Y-%m-%d %H:%M:%S') 192 | return dict(status='ok', datas={}, 193 | settings_py=self.handle_slash(self.SETTINGS_PY_PATH), settings=custom_settings, 194 | last_update_timestamp=last_update_timestamp, last_update_time=last_update_time, 195 | logparser_version=self.logparser_version) 196 | 197 | def handle_logfile(self, log_path): 198 | self.logger.debug("log_path: %s", log_path) 199 | project, spider, job, ext = self.parse_log_path(log_path) 200 | self.existing_file_keys.add(log_path) 201 | 202 | # The last modification time of Scrapy log does not change over time?! 
203 | # mtime = os.path.getmtime(log_path) 204 | # datetime.fromtimestamp(os.path.getmtime(log_path)).strftime('%Y-%m-%dT%H_%M_%S') 205 | size = os.path.getsize(log_path) 206 | 207 | if log_path not in self.datas: 208 | self.logger.info("New logfile found: %s (%s bytes)", log_path, size) 209 | json_path = os.path.join(self.SCRAPYD_LOGS_DIR, project, spider, '%s.json' % job) 210 | json_url = 'http://%s/logs/%s/%s/%s.json' % (self.SCRAPYD_SERVER, project, spider, job) 211 | # NOTE: do not use handle_slash() on log_path since parse_log_path() use os.sep 212 | data = OrderedDict(log_path=log_path, json_path=json_path, json_url=json_url, 213 | size=size, position=0, status='ok', _head='') 214 | self.datas[log_path] = data 215 | 216 | loaded_data = self.read_data(json_path) 217 | if loaded_data.get('size', -1) == size: 218 | data.update(loaded_data) # AVOID using data = 219 | self.logger.info("New logfile and its data with same size found: %s (size: %s) -> skip", 220 | json_path, loaded_data['size']) 221 | return 222 | else: 223 | self.logger.info("New logfile: %s (%s bytes) -> parse", log_path, size) 224 | elif size < self.datas[log_path]['size']: 225 | self.logger.warning("Old logfile with smaller size: %s (before: %s, now: %s bytes) -> parse in next round", 226 | log_path, self.datas[log_path]['size'], size) 227 | self.datas.pop(log_path) 228 | return 229 | elif size == self.datas[log_path]['size']: 230 | self.logger.debug("Old logfile with old size: %s (%s bytes) -> skip", log_path, size) 231 | return 232 | else: 233 | self.logger.info("Old logfile with new size: %s (%s bytes) -> parse", log_path, size) 234 | data = self.datas[log_path] 235 | 236 | if not self.KEEP_DATA_IN_MEMORY: 237 | # If the json file is broken, the logfile should be reparsed from position 0 238 | loaded_data = self.read_data(data['json_path']) 239 | if (loaded_data.get('size', -1) == data['size'] 240 | and loaded_data.get('position', -1) == data['position']): 241 | data.update(loaded_data) 242 | else: 243 | self.logger.warning("The logfile would be parsed from position 0: %s", log_path) 244 | data['position'] = 0 245 | data.pop('first_log_time', None) # See parse_appended_log() 246 | data['size'] = size 247 | 248 | # f.read(1000000) => f.tell() 15868 # safe 249 | # f.seek(1000000) => f.tell() 1000000 # unexpected 250 | # Add max() for logfile with 0 size 251 | for __ in range(data['position'], max(1, data['size']), self.CHUNK_SIZE): 252 | self.logger.debug("Remaining size to be read: %s bytes", data['size'] - data['position']) 253 | appended_log = self.read_appended_log(data, size=self.CHUNK_SIZE) 254 | if self.DEBUG: 255 | self.save_text_into_logs_dir('appended_log.log', appended_log) 256 | self.parse_appended_log(data, appended_log) 257 | 258 | return data 259 | 260 | @staticmethod 261 | def handle_slash(string): 262 | if not string: 263 | return string 264 | else: 265 | return string.replace('\\', '/') 266 | 267 | def handle_telnet(self, data): 268 | data.setdefault('crawler_engine', {}) 269 | if (self.ENABLE_TELNET 270 | and data['latest_matches']['telnet_console'] 271 | and data['crawler_stats'].get('source') != 'log'): # Do not telnet when the job is finished 272 | mytelnet = MyTelnet(data, self.OVERRIDE_TELNET_CONSOLE_HOST, self.verbose) 273 | crawler_stats, crawler_engine = mytelnet.main() 274 | if crawler_stats: 275 | # update_log_count=False to avoid wrong count in parse_appended_log() when the job is running 276 | self.update_data_with_crawler_stats(data, crawler_stats, update_log_count=False) 277 | 278 | 
data['crawler_stats'] = crawler_stats or data['crawler_stats'] 279 | data['crawler_engine'] = crawler_engine or data['crawler_engine'] 280 | self.logger.debug("crawler_stats:\n%s", self.json_dumps(data['crawler_stats'])) 281 | self.logger.debug("crawler_engine:\n%s", self.json_dumps(data['crawler_engine'])) 282 | 283 | @staticmethod 284 | def json_dumps(obj, sort_keys=False): 285 | return json.dumps(obj, ensure_ascii=False, indent=4, sort_keys=sort_keys) 286 | 287 | def main(self): 288 | while True: 289 | if self.main_pid: 290 | self.check_exit() 291 | start_time = time.time() 292 | try: 293 | self.run() 294 | end_time = time.time() 295 | self.logger.debug("Took %.1f seconds in this round", (end_time - start_time)) 296 | if 0 < self.EXIT_TIMEOUT < end_time - self.init_time: 297 | self.logger.critical("GoodBye, EXIT_TIMEOUT: %s", self.EXIT_TIMEOUT) 298 | break 299 | else: 300 | self.logger.info("Sleeping for %ss", self.PARSE_ROUND_INTERVAL) 301 | time.sleep(self.PARSE_ROUND_INTERVAL) 302 | except KeyboardInterrupt: 303 | if self.main_pid: 304 | self.logger.warning("LogParser subprocess (pid: %s) cancelled by KeyboardInterrupt", 305 | self.logparser_pid) 306 | else: 307 | self.logger.warning("KeyboardInterrupt") 308 | sys.exit() 309 | except: 310 | self.logger.error(traceback.format_exc()) 311 | 312 | def parse_appended_log(self, data, appended_log): 313 | tail_backup = data.get('tail', '') 314 | # Note that appended_log may be an empty string 315 | data_ = parse(appended_log, self.LOG_HEAD_LINES, self.LOG_TAIL_LINES) 316 | self.logger.debug("Parsed data_ from appended_log:\n%s", self.json_dumps(data_)) 317 | 318 | if 'first_log_time' not in data: 319 | # To keep the order of keys in Python 2 320 | for k, v in data_.items(): 321 | data[k] = v 322 | else: 323 | # data['head'] would be updated below 324 | data['tail'] = data_['tail'] 325 | 326 | if data['first_log_time'] == self.NA: 327 | data['first_log_time'] = data_['first_log_time'] 328 | data['first_log_timestamp'] = data_['first_log_timestamp'] 329 | if data_['latest_log_time'] != self.NA: 330 | data['latest_log_time'] = data_['latest_log_time'] 331 | data['latest_log_timestamp'] = data_['latest_log_timestamp'] 332 | data['runtime'] = self.calc_runtime(data['first_log_time'], data['latest_log_time']) 333 | 334 | data['datas'].extend(data_['datas']) 335 | for k in ['pages', 'items']: 336 | if data[k] is None: 337 | data[k] = data_[k] 338 | elif data_[k] is not None: 339 | data[k] = max(data[k], data_[k]) 340 | 341 | for k, v in data_['latest_matches'].items(): 342 | data['latest_matches'][k] = v or data['latest_matches'][k] 343 | # latest_crawl_timestamp, latest_scrape_timestamp 344 | for k in ['latest_crawl', 'latest_scrape']: 345 | if data_['latest_matches'][k]: 346 | data['%s_timestamp' % k] = data_['%s_timestamp' % k] 347 | 348 | # "log_categories": {"critical_logs": {"count": 0, "details": []}} 349 | for k, v in data_['log_categories'].items(): 350 | if v['count'] > 0: 351 | if data_['finish_reason'] != self.NA: 352 | data['log_categories'][k]['count'] = v['count'] 353 | else: 354 | data['log_categories'][k]['count'] += v['count'] 355 | data['log_categories'][k]['details'].extend(v['details']) 356 | 357 | for k in ['shutdown_reason', 'finish_reason']: 358 | if data_[k] != self.NA: 359 | data[k] = data_[k] 360 | data['crawler_stats'] = data_['crawler_stats'] or data['crawler_stats'] 361 | data['last_update_timestamp'] = data_['last_update_timestamp'] 362 | data['last_update_time'] = data_['last_update_time'] 363 | 364 | # To ensure 
the actual length of headlines and taillines 365 | if data['_head'] != self.LOG_HEAD_LINES: 366 | if data['_head']: 367 | if appended_log: 368 | data['head'] = '%s\n%s' % (data['_head'], appended_log) 369 | else: # appended_log would be empty string for short appended log 370 | data['head'] = data['_head'] 371 | else: 372 | data['head'] = appended_log 373 | data['head'] = self.cut_text(data['head'], self.LOG_HEAD_LINES) 374 | if self.count_actual_lines(data['head']) < self.LOG_HEAD_LINES: 375 | data['_head'] = data['head'] 376 | else: 377 | data['_head'] = self.LOG_HEAD_LINES 378 | 379 | if self.count_actual_lines(data['tail']) < self.LOG_TAIL_LINES: 380 | if tail_backup: 381 | if appended_log: 382 | data['tail'] = '%s\n%s' % (tail_backup, appended_log) 383 | else: 384 | data['tail'] = tail_backup 385 | else: 386 | data['tail'] = appended_log 387 | data['tail'] = self.cut_text(data['tail'], self.LOG_TAIL_LINES, keep_head=False) 388 | 389 | # TO limit each item e.g. critical_logs in log_categories 390 | # "log_categories": {"critical_logs": {"count": 0, "details": []}} 391 | for k, v in data['log_categories'].items(): 392 | v.update(details=v['details'][-self.LOG_CATEGORIES_LIMIT:]) 393 | 394 | self.logger.info("crawled_pages: %s, scraped_items: %s", data['pages'], data['items']) 395 | 396 | def read_appended_log(self, data, size=-1, backoff_times=10): 397 | # If the argument size is omitted, None, or negative, reads and returns all data until EOF. 398 | # https://stackoverflow.com/a/21533561/10517783 399 | # In text files (those opened without a b in the mode string), 400 | # only seeks relative to the beginning of the file are allowed 401 | # b'\x80abc'.decode('utf-8') 402 | # UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte 403 | size_backup = size 404 | text = '' 405 | with io.open(data['log_path'], 'rb') as f: 406 | f.seek(data['position']) 407 | for count in range(1, backoff_times + 1): 408 | try: 409 | text = f.read(size).decode(self.LOG_ENCODING, 'strict') 410 | except UnicodeDecodeError as err: 411 | self.logger.error(err) 412 | if count == backoff_times: 413 | self.logger.critical("Use f.read().decode(%s, 'replace') instead.", self.LOG_ENCODING) 414 | f.seek(data['position']) 415 | text = f.read(size_backup).decode(self.LOG_ENCODING, 'replace') 416 | else: 417 | # A backoff of 1 byte every time 418 | size = f.tell() - data['position'] - 1 419 | if size > 0: 420 | f.seek(data['position']) 421 | self.logger.warning("Fail %s times, backoff to %s and read %s bytes", count, f.tell(), size) 422 | else: 423 | break 424 | current_stream_position = f.tell() 425 | 426 | text_to_ignore = self.find_text_to_ignore(text) 427 | if text_to_ignore == text: 428 | return '' 429 | else: 430 | data['position'] = current_stream_position - len(text_to_ignore.encode(self.LOG_ENCODING)) 431 | appended_log = text[:-len(text_to_ignore)] if text_to_ignore else text 432 | self.logger.debug("Found appended log:\n%s", 433 | self.format_log_block('appended log', appended_log, lines_limit=10)) 434 | return appended_log 435 | 436 | def read_data(self, json_path): 437 | data = {} 438 | self.logger.debug("Try to load json file: %s", json_path) 439 | if not os.path.exists(json_path): 440 | self.logger.warning("Json file not found: %s", json_path) 441 | else: 442 | try: 443 | with io.open(json_path, 'r', encoding='utf-8') as f: 444 | data = json.loads(f.read(), object_pairs_hook=OrderedDict) 445 | except Exception as err: 446 | self.logger.error(err) 447 | else: 448 | 
self.logger.debug("Loaded json file: %s", json_path) 449 | logparser_version = data.get('logparser_version', '') 450 | if logparser_version != __version__: 451 | data = {} 452 | self.logger.warning("Ignore json file for mismatching version : %s", logparser_version) 453 | return data 454 | 455 | def run(self): 456 | self.log_paths = [] 457 | for ext in self.LOG_EXTENSIONS: 458 | self.log_paths.extend(glob.glob(os.path.join(self.SCRAPYD_LOGS_DIR, '*/*/*%s' % ext))) 459 | if not self.log_paths: 460 | self.logger.error("No logfiles found in %s/*/*/, check and update " 461 | "the `SCRAPYD_LOGS_DIR` option in %s", self.SCRAPYD_LOGS_DIR, self.SETTINGS_PY_PATH) 462 | else: 463 | self.logger.info("Found %s logfiles", len(self.log_paths)) 464 | 465 | self.existing_file_keys = set() 466 | for log_path in self.log_paths: 467 | try: 468 | data = self.handle_logfile(log_path) 469 | if not data: 470 | continue 471 | self.handle_telnet(data) 472 | self.save_data(data) 473 | except: 474 | self.logger.error(traceback.format_exc()) 475 | self.logger.warning("Pop %s from self.datas", log_path) 476 | self.datas.pop(log_path, None) 477 | 478 | if self.DEBUG: 479 | self.save_text_into_logs_dir('datas_complete.json', self.json_dumps(self.datas)) 480 | self.simplify_datas_in_memory() 481 | if self.DEBUG: 482 | self.save_text_into_logs_dir('datas_simplified.json', self.json_dumps(self.datas)) 483 | self.save_datas() 484 | 485 | def save_data(self, data): 486 | with io.open(data['json_path'], 'wb') as f: 487 | f.write(self.json_dumps(data).encode('utf-8', 'replace')) 488 | self.logger.info("Saved to %s", data['json_path']) 489 | 490 | def save_datas(self): 491 | stats = self.get_default_stats() 492 | for log_path, data in self.datas.items(): 493 | if self.KEEP_DATA_IN_MEMORY and log_path in self.existing_file_keys: 494 | data = self.simplify_data(dict(data)) 495 | else: 496 | data = dict(data) 497 | data.pop('_head') # To simplify data for 'List Stats' in the Overview page 498 | project, spider, job, ext = self.parse_log_path(log_path) 499 | stats['datas'].setdefault(project, {}) 500 | stats['datas'][project].setdefault(spider, {}) 501 | stats['datas'][project][spider][job] = data 502 | text = self.json_dumps(stats) 503 | self.logger.debug("stats.json:\n%s", text) 504 | self.save_text_into_logs_dir('stats.json', text) 505 | 506 | def save_text_into_logs_dir(self, filename, text): 507 | path = os.path.join(self.SCRAPYD_LOGS_DIR, filename) 508 | with io.open(path, 'wb') as f: 509 | content = text.encode('utf-8', 'replace') 510 | f.write(content) 511 | if filename == 'stats.json': 512 | self.logger.info("Saved to %s (%s bytes). 
Visit stats at: %s", self.stats_json_path, 513 | len(content), self.stats_json_url) 514 | else: 515 | self.logger.info("Saved to %s (%s bytes)", filename, len(content)) 516 | 517 | @staticmethod 518 | def simplify_data(data): 519 | data_ = OrderedDict() 520 | for k in SIMPLIFIED_KEYS: 521 | data_[k] = data[k] 522 | return data_ 523 | 524 | def simplify_datas_in_memory(self): 525 | all_keys = set(self.datas.keys()) 526 | redundant_keys = all_keys.difference(self.existing_file_keys) 527 | self.logger.debug("all_keys: %s", len(all_keys)) 528 | self.logger.debug("existing_file_keys: %s", len(self.existing_file_keys)) 529 | self.logger.debug("redundant_keys: %s", len(redundant_keys)) 530 | if self.KEEP_DATA_IN_MEMORY: 531 | keys_to_simplify = redundant_keys 532 | else: 533 | keys_to_simplify = all_keys 534 | for key in keys_to_simplify: 535 | if 'head' not in self.datas[key]: # Has been simplified 536 | continue 537 | self.logger.debug("Simplify %s in memory", key) 538 | self.datas[key] = self.simplify_data(self.datas[key]) 539 | self.logger.debug("Datas in memory: ") 540 | for key, value in self.datas.items(): 541 | self.logger.debug("%s: %s keys, size %s", key, len(value), sys.getsizeof(value)) 542 | 543 | # Remove data of deleted log to reduce the size of the stats.json file 544 | if len(all_keys) > self.JOBS_TO_KEEP and redundant_keys: 545 | self.logger.debug("JOBS_TO_KEEP: %s", self.JOBS_TO_KEEP) 546 | self.logger.debug("Limit the size of all_keys in memory: %s", len(all_keys)) 547 | for key in redundant_keys: 548 | self.datas.pop(key) 549 | self.logger.debug("Pop key: %s", key) 550 | self.logger.debug("Now all_keys in memory: %s", len(self.datas)) 551 | else: 552 | self.logger.debug("all_keys in memory: %s", len(self.datas)) 553 | -------------------------------------------------------------------------------- /logparser/run.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import argparse 3 | import os 4 | import sys 5 | import time 6 | 7 | from . 
import SETTINGS_PY_PATH 8 | from .__version__ import __description__, __url__, __version__ 9 | from .logparser import LogParser 10 | from .utils import check_update, custom_settings, get_logger 11 | 12 | 13 | logger = get_logger('logparser.run') # __name__ 14 | 15 | STAR = '\n%s\n' % ('*' * 100) 16 | 17 | 18 | def main(): 19 | logger.info("LogParser version: %s", __version__) 20 | logger.info("Run 'logparser -h' to get help") 21 | logger.info("Main pid: %s", os.getpid()) 22 | logger.info("Check out the config file below for more advanced settings.") 23 | print(u"{star}Loading settings from {path}{star}".format(star=STAR, path=SETTINGS_PY_PATH.replace('\\', '/'))) 24 | args = parse_args() 25 | # "logparser -h" ends up here 26 | try: 27 | update_config(args) 28 | except AssertionError as err: 29 | logger.error("Check config fail: ") 30 | sys.exit(u"\n{err}\nCheck and update your settings in {path}\n".format( 31 | err=err, path=SETTINGS_PY_PATH.replace('\\', '/'))) 32 | print("{star}Visit stats at: http://{server}/logs/stats.json{star}".format( 33 | star=STAR, server=custom_settings['scrapyd_server'])) 34 | # if not custom_settings.get('main_pid', 0): 35 | check_update() 36 | logparser = LogParser(**custom_settings) 37 | time.sleep(3) 38 | logparser.main() 39 | 40 | 41 | def parse_args(): 42 | parser = argparse.ArgumentParser(description="LogParser -- %s\nGitHub: %s" % (__description__, __url__)) 43 | 44 | scrapyd_server = custom_settings.get('scrapyd_server', '') or '127.0.0.1:6800' 45 | parser.add_argument( 46 | '-ss', '--scrapyd_server', 47 | default=scrapyd_server, 48 | help=("current: {server}, e.g. 127.0.0.1:6800, the stats of Scrapyd jobs can be accessed at: " 49 | "http://{server}/logs/stats.json").format(server=scrapyd_server) 50 | ) 51 | 52 | scrapyd_logs_dir = custom_settings.get('scrapyd_logs_dir', '') or os.path.join(os.path.expanduser('~'), 'logs') 53 | parser.add_argument( 54 | '-dir', '--scrapyd_logs_dir', 55 | default=scrapyd_logs_dir, 56 | help=("current: %s, e.g. C:/Users/username/logs/ or /home/username/logs/, " 57 | "Check out this link to find out where the Scrapy logs are stored: " 58 | "https://scrapyd.readthedocs.io/en/stable/config.html#logs-dir") % scrapyd_logs_dir 59 | ) 60 | 61 | parse_round_interval = custom_settings.get('parse_round_interval', 10) 62 | parser.add_argument( 63 | '-t', '--sleep', 64 | default=parse_round_interval, 65 | help="current: %s, sleep N seconds before starting next round of parsing logs." 
% parse_round_interval 66 | ) 67 | 68 | enable_telnet = custom_settings.get('enable_telnet', True) 69 | parser.add_argument( 70 | '-dt', '--disable_telnet', 71 | action='store_true', 72 | help=("current: ENABLE_TELNET = %s, append '--disable_telnet' to disable collecting " 73 | "Crawler.stats and Crawler.engine via telnet") % enable_telnet 74 | ) 75 | 76 | delete_existing_json_files_at_startup = custom_settings.get('delete_existing_json_files_at_startup', True) 77 | parser.add_argument( 78 | '-del', '--delete_json_files', 79 | action='store_true', 80 | help=("current: DELETE_EXISTING_JSON_FILES_AT_STARTUP = %s, append '--delete_json_files' " 81 | "to delete existing parsed results at startup" % delete_existing_json_files_at_startup) 82 | ) 83 | 84 | verbose = custom_settings.get('verbose', False) 85 | parser.add_argument( 86 | '-v', '--verbose', 87 | action='store_true', 88 | help=("current: VERBOSE = %s, append '--verbose' to set the logging level to DEBUG " 89 | "for getting more information about how LogParser works") % verbose 90 | ) 91 | 92 | parser.add_argument( 93 | '--main_pid', 94 | default=0, 95 | help="current: 0, reserved for running as a subprocess of ScrapydWeb, just ignore this argument" 96 | ) 97 | 98 | return parser.parse_args() 99 | 100 | 101 | def update_config(args): 102 | logger.debug("Reading settings from command line: %s", args) 103 | logger.debug("Checking config") 104 | 105 | custom_settings['scrapyd_server'] = args.scrapyd_server 106 | logger.info("SCRAPYD_SERVER: %s", custom_settings['scrapyd_server']) 107 | 108 | scrapyd_logs_dir = args.scrapyd_logs_dir 109 | assert os.path.isdir(scrapyd_logs_dir), "SCRAPYD_LOGS_DIR not found: %s" % repr(scrapyd_logs_dir) 110 | 111 | custom_settings['scrapyd_logs_dir'] = scrapyd_logs_dir 112 | logger.info("SCRAPYD_LOGS_DIR: %s", custom_settings['scrapyd_logs_dir']) 113 | 114 | parse_round_interval = args.sleep 115 | try: 116 | # ValueError: invalid literal for int() with base 10: '0.1' 117 | assert int(parse_round_interval) >= 0 118 | except (TypeError, ValueError, AssertionError): # [], '' 119 | assert False, "PARSE_ROUND_INTERVAL should be a non-negative integer: %s" % repr(parse_round_interval) 120 | custom_settings['parse_round_interval'] = int(parse_round_interval) 121 | logger.info("PARSE_ROUND_INTERVAL: %s", custom_settings['parse_round_interval']) 122 | 123 | # action='store_true': default False 124 | if args.disable_telnet: 125 | custom_settings['enable_telnet'] = False 126 | logger.info("ENABLE_TELNET: %s", custom_settings['enable_telnet']) 127 | 128 | if args.delete_json_files: 129 | custom_settings['delete_existing_json_files_at_startup'] = True 130 | logger.info("DELETE_EXISTING_JSON_FILES_AT_STARTUP: %s", custom_settings['delete_existing_json_files_at_startup']) 131 | 132 | if args.verbose: 133 | custom_settings['verbose'] = True 134 | logger.info("VERBOSE: %s", custom_settings['verbose']) 135 | 136 | main_pid = args.main_pid 137 | try: 138 | assert int(main_pid) >= 0 139 | except (TypeError, ValueError, AssertionError): # [], '' 140 | assert False, "main_pid should be a non-negative integer: %s" % repr(main_pid) 141 | custom_settings['main_pid'] = int(main_pid) 142 | if custom_settings['main_pid']: 143 | logger.info("main_pid: %s", custom_settings['main_pid']) 144 | 145 | 146 | if __name__ == '__main__': 147 | main() 148 | -------------------------------------------------------------------------------- /logparser/scrapylogparser.py: -------------------------------------------------------------------------------- 
1 | # coding: utf-8 2 | from collections import OrderedDict 3 | from datetime import datetime 4 | import re 5 | import time 6 | 7 | from .__version__ import __version__ 8 | from .common import Common 9 | 10 | 11 | def parse(text, headlines=100, taillines=200): 12 | """Parse the content of a Scrapy logfile and return an OrderedDict object. 13 | 14 | :param text: content to be parsed, in unicode. 15 | :param headlines: (optional) extracted the first N lines, the default is 100. 16 | :param taillines: (optional) extracted the last N lines, the default is 200. 17 | :return: an OrderedDict object 18 | :rtype: collections.OrderedDict 19 | 20 | Usage:: 21 | 22 | >>> from logparser import parse 23 | >>> d = parse(u'The content of a Scrapy logfile, in unicode.') 24 | """ 25 | return ScrapyLogParser(text, headlines, taillines).main() 26 | 27 | 28 | # noinspection PyTypeChecker 29 | class ScrapyLogParser(Common): 30 | 31 | def __init__(self, text, headlines=100, taillines=200): 32 | text = text.strip() 33 | self.data = OrderedDict() 34 | self.lines = re.split(self.LINESEP_PATTERN, text) 35 | self.data['head'] = '\n'.join(self.lines[:headlines]) 36 | self.data['tail'] = '\n'.join(self.lines[-taillines:]) 37 | # Modify text for self.DATAS_PATTERN, self.LOG_CATEGORIES_PATTERN_DICT 38 | self.text = '\n%s\n2019-01-01 00:00:01 [] DEBUG' % text 39 | 40 | def main(self): 41 | self.extract_time() 42 | self.extract_datas() 43 | self.extract_latest_matches() 44 | self.extract_log_categories() 45 | self.extract_shutdown_reason() 46 | self.extract_stats_dumped() 47 | self.data['last_update_time'], self.data['last_update_timestamp'] = self.get_current_time_timestamp() 48 | self.data['logparser_version'] = __version__ 49 | return self.data 50 | 51 | def re_search_final_match(self, pattern, default='', step=-1): 52 | for line in self.lines[::step]: 53 | if re.search(pattern, line): 54 | return line 55 | return default 56 | 57 | @staticmethod 58 | def string_to_datetime_obj(string): 59 | return datetime.strptime(string, '%Y-%m-%d %H:%M:%S') 60 | 61 | @staticmethod 62 | def datetime_obj_to_timestamp(datetime_obj): 63 | """ 64 | :param datetime_obj: datetime.datetime 65 | :rtype: int object 66 | """ 67 | return int(time.mktime(datetime_obj.timetuple())) 68 | 69 | def extract_time(self): 70 | self.data['first_log_time'] = self.re_search_final_match(r'^%s[ ]' % self.DATETIME_PATTERN, step=1)[:19] 71 | self.data['latest_log_time'] = self.re_search_final_match(r'^%s[ ]' % self.DATETIME_PATTERN)[:19] 72 | 73 | if self.data['first_log_time'] and self.data['latest_log_time']: 74 | first_log_datetime = self.string_to_datetime_obj(self.data['first_log_time']) 75 | latest_log_datetime = self.string_to_datetime_obj(self.data['latest_log_time']) 76 | self.data['runtime'] = str(latest_log_datetime - first_log_datetime) 77 | self.data['first_log_timestamp'] = self.datetime_obj_to_timestamp(first_log_datetime) 78 | self.data['latest_log_timestamp'] = self.datetime_obj_to_timestamp(latest_log_datetime) 79 | else: 80 | self.data['first_log_time'] = self.NA 81 | self.data['latest_log_time'] = self.NA 82 | self.data['runtime'] = self.NA 83 | self.data['first_log_timestamp'] = 0 84 | self.data['latest_log_timestamp'] = 0 85 | 86 | # Extract datas for chart 87 | def extract_datas(self): 88 | datas = re.findall(self.DATAS_PATTERN, self.text) 89 | # For compatibility with Python 2, str(time_) to avoid [u'2019-01-01 00:00:01', 0, 0, 0, 0] in JavaScript 90 | self.data['datas'] = [[str(time_), int(pages), int(pages_min), int(items), 
int(items_min)] 91 | for (time_, pages, pages_min, items, items_min) in datas] 92 | # TODO: Crawled (200) 69 | except Exception as err: 70 | self.logger.error("Fail to telnet to %s:%s for %s (%s). Maybe the job was stopped: %s", 71 | self.host, self.port, self.data['log_path'], self.scrapy_version, err) 72 | if self.verbose: 73 | self.logger.error(traceback.format_exc()) 74 | finally: 75 | if self.tn is not None: 76 | try: 77 | self.tn.close() 78 | except: 79 | pass 80 | self.tn = None 81 | 82 | return self.crawler_stats, self.crawler_engine 83 | 84 | # https://stackoverflow.com/questions/18547412/python-telnetlib-to-connect-to-scrapy-telnet-to-read-stats 85 | def run(self): 86 | self.logger.debug("scrapy_version: %s", self.scrapy_version) 87 | # Telnet via pexpect would cause '[twisted] CRITICAL: Unhandled Error' in Scrapy log on Fedora: 88 | # twisted/conch/telnet.py line 585, in dataReceived 89 | # raise ValueError("Stumped", b) 90 | # builtins.ValueError: ('Stumped', b'\\xec') 91 | if (self.ON_WINDOWS or self.on_fedora) and self.scrapy_version > SUPPORTED_SCRAPY_VERSION: 92 | self.logger.error("Telnet only supports scrapy<=%s if you are running Scrapyd on Windows and Fedora, " 93 | "current scrapy_version: %s", SUPPORTED_SCRAPY_VERSION, self.scrapy_version) 94 | return 95 | # Telnet console listening on 127.0.0.1:6023 96 | m = re.search(r'^(.+):(\d+)$', self.telnet_console) 97 | if not m: 98 | self.logger.warning("Fail to extract host and port from %s", self.telnet_console) 99 | return 100 | self.host, self.port = m.groups() 101 | self.host = self.OVERRIDE_TELNET_CONSOLE_HOST or self.host 102 | 103 | self.logger.debug("Try to telnet to %s:%s for %s", self.host, self.port, self.data['log_path']) 104 | if self.telnet_password or telnetlib is None: 105 | self.setup_pexpect() 106 | if self.tn is not None: 107 | self.pexpect_io() 108 | else: 109 | self.setup_telnet() 110 | if self.tn is not None: 111 | self.telnet_io() 112 | 113 | def setup_pexpect(self): 114 | # Cannot catch error directly here, see main() 115 | self.tn = pexpect.spawn('telnet %s %s' % (self.host, self.port), encoding='utf-8', timeout=TELNET_TIMEOUT) 116 | # logfile: ', mode 'w' at 0x7fe160149150> 117 | # logfile_read: None 118 | # logfile_send: None 119 | if self.verbose: 120 | self.tn.logfile = sys.stdout 121 | else: 122 | self.tn.logfile = io.open(os.path.join(self.CWD, TELNET_LOG_FILE), 'w') 123 | 124 | @staticmethod 125 | def telnet_callback(tn, command, option): 126 | if command == telnetlib.DO and option == telnetlib.TTYPE: 127 | tn.sendall(telnetlib.IAC + telnetlib.WILL + telnetlib.TTYPE) 128 | tn.sendall(telnetlib.IAC + telnetlib.SB + telnetlib.TTYPE + '\0' + 'LogParser' + telnetlib.IAC + telnetlib.SE) 129 | elif command in (telnetlib.DO, telnetlib.DONT): 130 | tn.sendall(telnetlib.IAC + telnetlib.WILL + option) 131 | elif command in (telnetlib.WILL, telnetlib.WONT): 132 | tn.sendall(telnetlib.IAC + telnetlib.DO + option) 133 | 134 | def setup_telnet(self): 135 | self.tn = telnetlib.Telnet(self.host, int(self.port), timeout=TELNET_TIMEOUT) 136 | # [twisted] CRITICAL: Unhandled Error 137 | # Failure: twisted.conch.telnet.OptionRefused: twisted.conch.telnet.OptionRefused 138 | # https://github.com/jookies/jasmin-web/issues/2 139 | self.tn.set_option_negotiation_callback(self.telnet_callback) 140 | if self.verbose: 141 | self.tn.set_debuglevel(logging.DEBUG) 142 | 143 | def parse_output(self, text): 144 | m = re.search(r'{.+}', text) 145 | if m: 146 | result = self.parse_crawler_stats(m.group()) 147 | else: 148 | 
lines = [line for line in re.split(r'\r\n|\n|\r', text) if ':' in line] 149 | result = dict([re.split(r'\s*:\s*', line, maxsplit=1) for line in lines]) 150 | for k, v in result.items(): 151 | if k == 'engine.spider.name': 152 | continue 153 | elif v == 'True': 154 | result[k] = True 155 | elif v == 'False': 156 | result[k] = False 157 | else: 158 | try: 159 | result[k] = int(float(v)) 160 | except (TypeError, ValueError): 161 | pass 162 | if result: 163 | return self.get_ordered_dict(result, source='telnet') 164 | else: 165 | return {} 166 | 167 | def pexpect_io(self): 168 | def bytes_to_str(src): 169 | if self.PY2: 170 | return src 171 | return src.decode('utf-8') 172 | # TypeError: got ('Username: ') as pattern, 173 | # must be one of: , pexpect.EOF, pexpect.TIMEOUT 174 | if self.telnet_password: 175 | self.tn.expect(u'Username: ', timeout=TELNET_TIMEOUT) 176 | self.tn.sendline(self.telnet_username) 177 | self.tn.expect(u'Password: ', timeout=TELNET_TIMEOUT) 178 | self.tn.sendline(self.telnet_password) 179 | self.tn.expect(u'>>>', timeout=TELNET_TIMEOUT) 180 | self.logger.debug("Login successfully") 181 | else: 182 | self.tn.expect(u'>>>', timeout=TELNET_TIMEOUT) 183 | self.logger.debug("Connect successfully") 184 | 185 | self.tn.sendline(bytes_to_str(TELNETCONSOLE_COMMAND_MAP['log_file'])) 186 | self.tn.expect(re.compile(r'[\'"].+>>>', re.S), timeout=TELNET_TIMEOUT) 187 | log_file = self.tn.after 188 | self.logger.debug("settings['LOG_FILE'] found via telnet: %s", log_file) 189 | if not self.verify_log_file_path(self.parse_log_path(self.data['log_path']), log_file): 190 | self.logger.warning("Skip telnet due to mismatching: %s AND %s", self.data['log_path'], log_file) 191 | return 192 | 193 | self.tn.sendline(bytes_to_str(TELNETCONSOLE_COMMAND_MAP['crawler_stats'])) 194 | self.tn.expect(re.compile(r'{.+>>>', re.S), timeout=TELNET_TIMEOUT) 195 | self.crawler_stats = self.parse_output(self.tn.after) 196 | 197 | self.tn.sendline(bytes_to_str(TELNETCONSOLE_COMMAND_MAP['crawler_engine'])) 198 | self.tn.expect(re.compile(r'Execution engine status.+>>>', re.S), timeout=TELNET_TIMEOUT) 199 | self.crawler_engine = self.parse_output(self.tn.after) 200 | 201 | def _telnet_io(self, command): 202 | # Microsoft Telnet> o 203 | # ( to )127.0.0.1 6023 204 | # >>>stats.get_stats() 205 | # >>>est() 206 | self.tn.write(b'%s\n' % command) 207 | content = self.tn.read_until(b'\n>>>', timeout=TELNET_TIMEOUT) 208 | # print(repr(content)) 209 | # b"\x1bc>>> \x1b[4hstats.get_stats()\r\r\r\n{'log_count/INFO': 61, 210 | # 'start_time': datetime.datetime(2019, 1, 22, 9, 7, 14, 998126), 211 | # 'httperror/response_ignored_status_count/404': 1}\r\r\r\n>>>" 212 | # b' est()\r\r\r\nExecution engine status\r\r\r\n\r\r\r\n 213 | # time()-engine.start_time : 3249.7548048496246 214 | # engine.scraper.slot.needs_backout() : False\r\r\r\n\r\r\r\n\r\r\r\n>>>' 215 | return content.decode('utf-8') 216 | 217 | def telnet_io(self): 218 | # spider._job, spider._version, settings.attributes["BOT_NAME"].value, JOB, SPIDER, PROJECT 219 | # '\'logs\\\\demo_persistent\\\\test\\\\2019-01-23T18_25_34.log\'\r\r\r\n>>>' 220 | log_file = self._telnet_io(TELNETCONSOLE_COMMAND_MAP['log_file']) 221 | self.logger.debug("settings['LOG_FILE'] found via telnet: %s", log_file) 222 | # Username: Password: 223 | if 'Username:' in log_file: 224 | self.logger.error("Telnet with auth is not supported on Windows. 
You can use scrapy<=%s instead: %s", 225 | SUPPORTED_SCRAPY_VERSION, log_file) 226 | return 227 | if not self.verify_log_file_path(self.parse_log_path(self.data['log_path']), log_file): 228 | self.logger.warning("Skip telnet due to mismatching: %s vs %s", self.data['log_path'], log_file) 229 | return 230 | self.crawler_stats = self.parse_output(self._telnet_io(TELNETCONSOLE_COMMAND_MAP['crawler_stats'])) 231 | self.crawler_engine = self.parse_output(self._telnet_io(TELNETCONSOLE_COMMAND_MAP['crawler_engine'])) 232 | 233 | def verify_log_file_path(self, parts, log_file): 234 | for part in parts: 235 | if part not in log_file: 236 | self.logger.warning("%s not found in settings['LOG_FILE']: %s", part, log_file) 237 | return False 238 | return True 239 | -------------------------------------------------------------------------------- /logparser/utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | import logging 4 | import platform 5 | import sys 6 | 7 | try: 8 | from scrapy import __version__ as scrapy_version 9 | except ImportError: 10 | scrapy_version = '0.0.0' 11 | from six.moves.urllib.parse import urlencode 12 | from six.moves.urllib.request import Request, urlopen 13 | 14 | from .__version__ import __version__ 15 | from .settings import (SCRAPYD_SERVER, SCRAPYD_LOGS_DIR, PARSE_ROUND_INTERVAL, 16 | ENABLE_TELNET, OVERRIDE_TELNET_CONSOLE_HOST, LOG_ENCODING, LOG_EXTENSIONS, 17 | LOG_HEAD_LINES, LOG_TAIL_LINES, LOG_CATEGORIES_LIMIT, JOBS_TO_KEEP, CHUNK_SIZE, 18 | DELETE_EXISTING_JSON_FILES_AT_STARTUP, KEEP_DATA_IN_MEMORY, VERBOSE) 19 | 20 | 21 | custom_settings = dict( 22 | scrapyd_server=SCRAPYD_SERVER, 23 | scrapyd_logs_dir=SCRAPYD_LOGS_DIR, 24 | parse_round_interval=PARSE_ROUND_INTERVAL, 25 | enable_telnet=ENABLE_TELNET, 26 | override_telnet_console_host=OVERRIDE_TELNET_CONSOLE_HOST, 27 | log_encoding=LOG_ENCODING, 28 | log_extensions=LOG_EXTENSIONS, 29 | log_head_lines=LOG_HEAD_LINES, 30 | log_tail_lines=LOG_TAIL_LINES, 31 | log_categories_limit=LOG_CATEGORIES_LIMIT, 32 | jobs_to_keep=JOBS_TO_KEEP, 33 | chunk_size=CHUNK_SIZE, 34 | delete_existing_json_files_at_startup=DELETE_EXISTING_JSON_FILES_AT_STARTUP, 35 | keep_data_in_memory=KEEP_DATA_IN_MEMORY, 36 | verbose=VERBOSE, 37 | # main_pid=0, 38 | # debug=False, 39 | # exit_timeout=0 40 | ) 41 | 42 | 43 | def get_logger(name, level=logging.DEBUG): 44 | logger = logging.getLogger(name) 45 | handler = logging.StreamHandler() 46 | formatter = logging.Formatter(fmt="[%(asctime)s] %(levelname)-8s in %(name)s: %(message)s") 47 | handler.setFormatter(formatter) 48 | logger.addHandler(handler) 49 | logger.setLevel(level) 50 | return logger 51 | 52 | 53 | def check_update(timeout=5, **kwargs): 54 | logger = get_logger(__name__) 55 | js = {} 56 | try: 57 | data = dict(custom_settings) 58 | data['os'] = platform.platform() 59 | data['py'] = '.'.join([str(n) for n in sys.version_info[:3]]) 60 | data['logparser'] = __version__ 61 | data['scrapy_version'] = scrapy_version 62 | data.update(kwargs) 63 | # print(data) 64 | url = 'https://my8100.pythonanywhere.com/check_update' 65 | json_data = json.dumps(data).encode('utf-8') 66 | req = Request(url, data=json_data) 67 | req.add_header('Content-Type', 'application/json') 68 | with urlopen(req, timeout=timeout) as resp: 69 | text = resp.read().decode('utf-8', 'replace') 70 | # print(text) 71 | js = json.loads(text) 72 | # print(js) 73 | # except Exception as err: 74 | # print(err) 75 | except: 76 | pass 77 | else: 78 | if 
js.get('latest_version') == __version__: 79 | logger.info("Running the latest version: %s", __version__) 80 | else: 81 | if js.get('info', ''): 82 | logger.warning(js['info']) 83 | if js.get('force_update', ''): 84 | sys.exit("Please update and then restart logparser. ") 85 | return js # For test only 86 | -------------------------------------------------------------------------------- /requirements-tests.txt: -------------------------------------------------------------------------------- 1 | pip>=19.1.1 2 | flake8 3 | coverage 4 | pytest 5 | # pytest-cov 6 | coveralls 7 | allure-pytest 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pip>=19.1.1 2 | pexpect>=4.7.0 3 | six>=1.12.0 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import io 3 | import os 4 | import re 5 | 6 | from setuptools import find_packages, setup 7 | 8 | 9 | CWD = os.path.dirname(os.path.abspath(__file__)) 10 | 11 | about = {} 12 | with open(os.path.join(CWD, 'logparser', '__version__.py')) as f: 13 | exec(f.read(), about) 14 | 15 | with io.open("README.md", 'r', encoding='utf-8') as f: 16 | long_description = re.sub(r':\w+:\s', '', f.read()) # Remove emojis 17 | 18 | 19 | setup( 20 | name=about['__title__'], 21 | version=about['__version__'], 22 | author=about['__author__'], 23 | author_email=about['__author_email__'], 24 | url=about['__url__'], 25 | license=about['__license__'], 26 | description=about['__description__'], 27 | 28 | long_description=long_description, 29 | long_description_content_type="text/markdown", 30 | 31 | packages=find_packages(exclude=("tests", )), 32 | include_package_data=True, 33 | zip_safe=False, 34 | python_requires=">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*", 35 | install_requires=[ 36 | "pexpect >= 4.7.0", # Apr 7, 2019 37 | "six >= 1.12.0", # Dec 10, 2018 38 | ], 39 | 40 | entry_points={ 41 | "console_scripts": { 42 | "logparser = logparser.run:main" 43 | } 44 | }, 45 | 46 | classifiers=[ 47 | "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", 48 | "Programming Language :: Python :: 2.7", 49 | "Programming Language :: Python :: 3.6", 50 | "Programming Language :: Python :: 3.7", 51 | "Programming Language :: Python :: 3.8", 52 | "Programming Language :: Python :: 3.9", 53 | "Programming Language :: Python :: 3.10", 54 | "Programming Language :: Python :: 3.11", 55 | "Programming Language :: Python :: 3.12", 56 | "Programming Language :: Python :: 3.13", 57 | ] 58 | ) 59 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/my8100/logparser/31e2661617d8f66b37d55c1cf49c225ea11fa6b0/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Set 'jobs_to_keep = 100' in the config file of Scrapyd before running the test. 4 | https://scrapyd.readthedocs.io/en/latest/config.html#jobs-to-keep 5 | jobs_to_keep: The number of finished jobs to keep per spider. Defaults to 5. This refers to logs and items. 
6 | finished_to_keep: The number of finished processes to keep in the launcher. Defaults to 100. 7 | """ 8 | import os 9 | from shutil import rmtree 10 | import zipfile 11 | 12 | import pytest 13 | 14 | from logparser.logparser import LogParser 15 | from tests.demo_log import FRONT, END 16 | from tests.utils import cst, SETTINGS 17 | 18 | 19 | """ 20 | from tests.utils import settings # settings = dict(SETTINGS) shared between test functions 21 | @pytest.fixture 22 | def parser(): 23 | print('parser fixture') 24 | print(settings) 25 | 26 | if os.path.isdir(cst.LOGS_PATH): 27 | rmtree(cst.LOGS_PATH, ignore_errors=True) 28 | with zipfile.ZipFile(cst.LOGS_ZIP_PATH, 'r') as f: 29 | f.extractall(cst.CWD) 30 | 31 | parser = LogParser(**settings) 32 | yield parser 33 | 34 | def test_default_settings(parser): 35 | parser.main() 36 | """ 37 | 38 | 39 | # https://stackoverflow.com/questions/18011902/py-test-pass-a-parameter-to-a-fixture-function 40 | @pytest.fixture 41 | def psr(): 42 | def new_a_parser(execute_main=True, reset_logs=True, **kwargs): 43 | if reset_logs: 44 | if os.path.isdir(cst.LOGS_PATH): 45 | rmtree(cst.LOGS_PATH, ignore_errors=True) 46 | import time 47 | time.sleep(1) 48 | with zipfile.ZipFile(cst.LOGS_ZIP_PATH, 'r') as f: 49 | f.extractall(cst.CWD) 50 | cst.write_text(cst.LOG_PATH, FRONT + END) 51 | cst.write_text(cst.TXT_PATH, FRONT + END) 52 | 53 | settings = dict(SETTINGS) 54 | if kwargs: 55 | settings.update(kwargs) 56 | print(settings) 57 | 58 | parser = LogParser(**settings) 59 | if execute_main: 60 | parser.main() 61 | return parser 62 | 63 | yield new_a_parser 64 | -------------------------------------------------------------------------------- /tests/demo_log.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | ERROR_404 = u""" 4 | 5 | 404 - No Such Resource 6 | 7 |

No Such Resource 8 | File not found.

9 | 10 | 11 | """ 12 | 13 | SHUTDOWN = u"""2019-01-01 00:00:01 [scrapy.crawler] INFO: Received SIGTERM, shutting down gracefully. Send again to force 14 | 2019-01-01 00:00:01 [scrapy.core.engine] INFO: Closing spider (shutdown) 15 | 2019-01-01 00:00:01 [scrapy.crawler] INFO: Received SIGTERM twice, forcing unclean shutdown 16 | 2019-01-01 00:00:01 [scrapy.statscollectors] INFO: Dumping Scrapy stats: 17 | { 18 | 'finish_reason': 'shutdown', 19 | "robotstxt/exception_count/": 1, 20 | 'start_time': datetime.datetime(2019, 3, 9, 13, 55, 24, 601697) 21 | } 22 | 2019-01-01 00:00:01 [scrapy.core.engine] INFO: Spider closed (finished)""" 23 | 24 | TELNET_160_DEFAULT = u"""2019-06-11 15:53:48 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: ScrapydWeb_demo) 25 | 2019-06-11 15:53:48 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'ScrapydWeb_demo', 'FEED_EXPORT_ENCODING': 'utf-8', 'FEED_URI': 'file:///C:/Users/win7/items/ScrapydWeb_demo/test/2019-06-11T15_53_43.jl', 'LOG_FILE': 'logs\\ScrapydWeb_demo\\test\\2019-06-11T15_53_43.log', 'NEWSPIDER_MODULE': 'ScrapydWeb_demo.spiders', 'ROBOTSTXT_OBEY': True, 'SPIDER_MODULES': ['ScrapydWeb_demo.spiders']} 26 | 2019-06-11 15:53:48 [scrapy.extensions.telnet] INFO: Telnet Password: 9d3a29f17ee1bf9a 27 | 2019-06-11 15:53:49 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6024 28 | """ 29 | 30 | TELNET_160_USERNAME = u"""2019-06-11 16:05:38 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: ScrapydWeb_demo) 31 | 2019-06-11 16:05:38 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'ScrapydWeb_demo', 'FEED_EXPORT_ENCODING': 'utf-8', 'FEED_URI': 'file:///C:/Users/win7/items/ScrapydWeb_demo/test/2019-06-11T16_05_09.jl', 'LOG_FILE': 'logs\\ScrapydWeb_demo\\test\\2019-06-11T16_05_09.log', 'NEWSPIDER_MODULE': 'ScrapydWeb_demo.spiders', 'ROBOTSTXT_OBEY': True, 'SPIDER_MODULES': ['ScrapydWeb_demo.spiders'], 'TELNETCONSOLE_USERNAME': 'usr123'} 32 | 2019-06-11 16:05:38 [scrapy.extensions.telnet] INFO: Telnet Password: d24ad6be287d69b3 33 | 2019-06-11 16:05:38 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6024 34 | """ 35 | 36 | TELNET_160_PASSWORD = u"""2019-06-11 16:08:44 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: ScrapydWeb_demo) 37 | 2019-06-11 16:08:44 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'ScrapydWeb_demo', 'FEED_EXPORT_ENCODING': 'utf-8', 'FEED_URI': 'file:///C:/Users/win7/items/ScrapydWeb_demo/test/2019-06-11T16_07_57.jl', 'LOG_FILE': 'logs\\ScrapydWeb_demo\\test\\2019-06-11T16_07_57.log', 'NEWSPIDER_MODULE': 'ScrapydWeb_demo.spiders', 'ROBOTSTXT_OBEY': True, 'SPIDER_MODULES': ['ScrapydWeb_demo.spiders'], 'TELNETCONSOLE_PASSWORD': '456psw'} 38 | 2019-06-11 16:08:44 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6024 39 | """ 40 | 41 | TELNET_160_USERNAME_PASSWORD = u"""2019-06-11 16:15:13 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: ScrapydWeb_demo) 42 | 2019-06-11 16:15:13 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'ScrapydWeb_demo', 'FEED_EXPORT_ENCODING': 'utf-8', 'FEED_URI': 'file:///C:/Users/win7/items/ScrapydWeb_demo/test/2019-06-11T16_14_36.jl', 'LOG_FILE': 'logs\\ScrapydWeb_demo\\test\\2019-06-11T16_14_36.log', 'NEWSPIDER_MODULE': 'ScrapydWeb_demo.spiders', 'ROBOTSTXT_OBEY': True, 'SPIDER_MODULES': ['ScrapydWeb_demo.spiders'], 'TELNETCONSOLE_PASSWORD': '456psw', 'TELNETCONSOLE_USERNAME': 'usr123'} 43 | 2019-06-11 16:15:14 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6024 44 | """ 45 | 46 | 
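# The TELNET_* constants in this module are captured Scrapy startup log headers
# (Scrapy 1.5.1 and 1.6.0, with and without TELNETCONSOLE_USERNAME / TELNETCONSOLE_PASSWORD),
# presumably used to test how the telnet console host, port and credentials are picked up from a log.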
TELNET_151_NO_PORT = u"""2019-06-15 11:53:00 [scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: demo_project) 47 | 2019-06-15 11:53:01 [scrapy.extensions.telnet] DEBUG: Telnet console listening on localhost 48 | 2019-06-15 11:53:02 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) 49 | """ 50 | 51 | TELNET_151_PORT_16023 = u"""2019-06-15 11:53:00 [scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: demo_project) 52 | 2019-06-15 11:53:01 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:16023 53 | 2019-06-15 11:53:02 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) 54 | """ 55 | 56 | TELNET_160_PORT_16024 = u"""2019-06-15 11:53:00 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: demo_project) 57 | 2019-06-15 11:53:01 [scrapy.extensions.telnet] INFO: Telnet Password: 9d3a29f17ee1bf9a 58 | 2019-06-15 11:53:01 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:16024 59 | 2019-06-15 11:53:02 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) 60 | """ 61 | 62 | LATEST_SCRAPE_ITEM_ONE_LINE = u"""2019-01-01 00:00:01 [scrapy.core.scraper] DEBUG: Scraped from <200 http://httpbin.org/get> 63 | {'item': 1} 64 | """ 65 | 66 | LATEST_SCRAPE_ITEM_MULTIPLE_LINES = u"""2019-01-01 00:00:02 [scrapy.core.scraper] DEBUG: Scraped from <200 http://httpbin.org/get> 67 | { 68 | 'item': 2 69 | } 70 | """ 71 | 72 | LATEST_SCRAPE_ITEM_MIXED = u"""2019-01-01 00:00:01 [scrapy.core.scraper] DEBUG: Scraped from <200 http://httpbin.org/get> 73 | {'item': 1} 74 | 2019-01-01 00:00:03 [scrapy.core.scraper] DEBUG: Scraped from <200 http://httpbin.org/get> 75 | { 76 | 'item': { 77 | u'Chinese \u6c49\u5b57': 3 78 | } 79 | } 80 | 2019-01-01 00:00:04 [scrapy_fieldstats.fieldstats] INFO: Field stats: 81 | {'item': '100%'} 82 | 2019-01-01 00:00:04 [scrapy.statscollectors] INFO: Dumping Scrapy stats: 83 | """ 84 | 85 | SCRAPY_FIELDSTATS = u"""2019-01-01 00:00:01 [scrapy.statscollectors] INFO: Dumping Scrapy stats: 86 | {'dupefilter/filtered': 1, 87 | 'fields_coverage': { 88 | u'Chinese \u6c49\u5b57': '50%', 89 | 'author': { 90 | 'a': 1, 91 | 'b': 2 92 | } 93 | }, 94 | 'finish_reason': 'finished'} 95 | """ 96 | 97 | FRONT = u"""2018-10-23 18:28:34 [scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: demo) 98 | 2018-10-23 18:28:34 [scrapy.utils.log] INFO: Versions: lxml 4.2.1.0, libxml2 2.9.7, cssselect 1.0.3, parsel 1.4.0, w3lib 1.19.0, Twisted 17.5.0, Python 3.6.5 |Anaconda, Inc.| (default, Mar 29 2018, 13:32:41) [MSC v.1900 64 bit (AMD64)], pyOpenSSL 17.5.0 (OpenSSL 1.0.2o 27 Mar 2018), cryptography 2.2.1, Platform Windows-7-6.1.7601-SP1 99 | 2018-10-23 18:28:34 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'demo', 'CONCURRENT_REQUESTS': 2, 'COOKIES_ENABLED': False, 'DOWNLOAD_DELAY': 1, 'FEED_URI': 'file:///C:/Users/win7/items/demo/test/2018-10-23_182826.jl', 'LOGSTATS_INTERVAL': 1, 'LOG_FILE': 'logs/demo/test/2018-10-23_182826.log', 'NEWSPIDER_MODULE': 'demo.spiders', 'SPIDER_MODULES': ['demo.spiders'], 'USER_AGENT': 'Mozilla/5.0'} 100 | 2018-10-23 18:28:34 [scrapy.middleware] INFO: Enabled extensions: 101 | ['scrapy.extensions.corestats.CoreStats', 102 | 'scrapy.extensions.telnet.TelnetConsole', 103 | 'scrapy.extensions.feedexport.FeedExporter', 104 | 'scrapy.extensions.logstats.LogStats'] 105 | 2018-10-23 18:28:35 [scrapy.middleware] INFO: Enabled downloader middlewares: 106 | 
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware', 107 | 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware', 108 | 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware', 109 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware', 110 | 'scrapy.downloadermiddlewares.retry.RetryMiddleware', 111 | 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware', 112 | 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware', 113 | 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware', 114 | 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware', 115 | 'scrapy.downloadermiddlewares.stats.DownloaderStats'] 116 | 2018-10-23 18:28:35 [scrapy.middleware] INFO: Enabled spider middlewares: 117 | ['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware', 118 | 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware', 119 | 'scrapy.spidermiddlewares.referer.RefererMiddleware', 120 | 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware', 121 | 'scrapy.spidermiddlewares.depth.DepthMiddleware'] 122 | 2018-10-23 18:28:35 [scrapy.middleware] INFO: Enabled item pipelines: 123 | [] 124 | 2018-10-23 18:28:35 [scrapy.core.engine] INFO: Spider opened 125 | 2018-10-23 18:28:35 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) 126 | 2018-10-23 18:28:35 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023 127 | 2018-10-23 18:28:35 [test] DEBUG: test utf8: 测试中文 128 | 2018-10-23 18:28:35 [test] DEBUG: 2018-08-20 09:13:06 [apps_redis] DEBUG: Resuming crawl (675840 requests scheduled) 129 | 2018-10-23 18:28:35 [test] WARNING: warn 130 | 2018-10-23 18:28:35 [test] ERROR: error 131 | 2018-10-23 18:28:35 [test] WARNING: warning 132 | 123abc 133 | 2018-10-23 18:28:35 [test] ERROR: error 134 | 456abc 135 | 2018-10-23 18:28:35 [test] ERROR: error 136 | 456abc 137 | 2018-10-23 18:28:35 [test] CRITICAL: critical 138 | 789abc 139 | 2018-10-23 18:28:35 [test] WARNING: warning 140 | 123 141 | abc 142 | 2018-10-23 18:28:35 [test] ERROR: error 143 | 456 144 | abc 145 | 2018-10-23 18:28:35 [test] CRITICAL: critical 146 | 2018-10-23 18:28:35 [test] CRITICAL: critical 147 | 789 148 | abc 149 | 2018-10-23 18:28:35 [test] CRITICAL: critical 150 | 2018-10-23 18:28:35 [test] CRITICAL: critical 151 | 2018-10-23 18:28:35 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to from 152 | 2018-10-23 18:28:36 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) 153 | 2018-10-23 18:28:36 [scrapy.core.engine] DEBUG: Crawled (404) (referer: None) 154 | 2018-10-23 18:28:36 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <404 http://httpbin.org/status/404>: HTTP status code is not handled or not allowed 155 | 2018-10-23 18:28:37 [scrapy.extensions.logstats] INFO: Crawled 1 pages (at 60 pages/min), scraped 0 items (at 0 items/min) 156 | 2018-10-23 18:28:37 [scrapy.core.engine] DEBUG: Crawled (200) (referer: None) 157 | 2018-10-23 18:28:37 [scrapy.dupefilters] DEBUG: Filtered duplicate request: - no more duplicates will be shown (see DUPEFILTER_DEBUG to show all duplicates) 158 | 2018-10-23 18:28:37 [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'www.baidu.com': 159 | 2018-10-23 18:28:37 [scrapy.core.scraper] DEBUG: Scraped from <200 http://httpbin.org/get> 160 | {'item': 1} 161 | 2018-10-23 18:28:38 [scrapy.extensions.logstats] INFO: Crawled 2 pages (at 60 pages/min), scraped 1 items (at 60 
items/min) 162 | 2018-10-23 18:28:39 [scrapy.extensions.logstats] INFO: Crawled 2 pages (at 0 pages/min), scraped 1 items (at 0 items/min) 163 | 2018-10-23 18:28:39 [scrapy.core.engine] DEBUG: Crawled (200) (referer: None) 164 | 2018-10-23 18:28:39 [scrapy.core.scraper] DEBUG: Scraped from <200 http://httpbin.org/headers> 165 | {'item': 2} 166 | 2018-10-23 18:28:40 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 60 pages/min), scraped 2 items (at 60 items/min) 167 | 2018-10-23 18:28:41 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 168 | 2018-10-23 18:28:42 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 169 | 2018-10-23 18:28:43 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 170 | 2018-10-23 18:28:44 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 171 | 2018-10-23 18:28:45 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 172 | 2018-10-23 18:28:46 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 173 | 2018-10-23 18:28:47 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 174 | 2018-10-23 18:28:48 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 175 | 2018-10-23 18:28:49 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 176 | 2018-10-23 18:28:50 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 177 | 2018-10-23 18:28:51 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 178 | 2018-10-23 18:28:52 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 179 | 2018-10-23 18:28:53 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 180 | 2018-10-23 18:28:54 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 181 | 2018-10-23 18:28:55 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 182 | 2018-10-23 18:28:56 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 183 | 2018-10-23 18:28:57 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 184 | 2018-10-23 18:28:58 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 185 | 2018-10-23 18:28:58 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying (failed 1 times): TCP connection timed out: 10060: 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。. 
186 | 2018-10-23 18:28:59 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 187 | 2018-10-23 18:29:00 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 188 | 2018-10-23 18:29:01 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 189 | 2018-10-23 18:29:02 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 190 | 2018-10-23 18:29:03 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 191 | 2018-10-23 18:29:04 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 192 | 2018-10-23 18:29:05 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 193 | 2018-10-23 18:29:06 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 194 | 2018-10-23 18:29:07 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 195 | 2018-10-23 18:29:08 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 196 | 2018-10-23 18:29:09 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 197 | 2018-10-23 18:29:10 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 198 | 2018-10-23 18:29:11 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 199 | 2018-10-23 18:29:12 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 200 | 2018-10-23 18:29:13 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 201 | 2018-10-23 18:29:14 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 202 | 2018-10-23 18:29:15 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 203 | 2018-10-23 18:29:16 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 204 | 2018-10-23 18:29:17 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 205 | 2018-10-23 18:29:18 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 206 | 2018-10-23 18:29:19 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 207 | 2018-10-23 18:29:19 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying (failed 2 times): TCP connection timed out: 10060: 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。. 
208 | 2018-10-23 18:29:20 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 209 | 2018-10-23 18:29:21 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 210 | 2018-10-23 18:29:22 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 211 | 2018-10-23 18:29:23 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 212 | 2018-10-23 18:29:24 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 213 | 2018-10-23 18:29:25 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 214 | 2018-10-23 18:29:26 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 215 | 2018-10-23 18:29:27 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 216 | 2018-10-23 18:29:28 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 217 | 2018-10-23 18:29:29 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 218 | 2018-10-23 18:29:30 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 219 | 2018-10-23 18:29:31 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 220 | 2018-10-23 18:29:32 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 221 | 2018-10-23 18:29:33 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 222 | 2018-10-23 18:29:34 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 223 | 2018-10-23 18:29:35 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 224 | 2018-10-23 18:29:36 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 225 | 2018-10-23 18:29:37 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 226 | 2018-10-23 18:29:38 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 227 | 2018-10-23 18:29:39 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 228 | 2018-10-23 18:29:40 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 229 | 2018-10-23 18:29:40 [scrapy.downloadermiddlewares.retry] DEBUG: Gave up retrying (failed 3 times): TCP connection timed out: 10060: 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。. 
230 | 2018-10-23 18:29:41 [scrapy.core.scraper] ERROR: Error downloading 231 | Traceback (most recent call last): 232 | File "e:/programdata/anaconda3/envs/py3/lib/site-packages/twisted/internet/defer.py", line 1384, in _inlineCallbacks 233 | result = result.throwExceptionIntoGenerator(g) 234 | File "e:/programdata/anaconda3/envs/py3/lib/site-packages/twisted/python/failure.py", line 393, in throwExceptionIntoGenerator 235 | return g.throw(self.type, self.value, self.tb) 236 | File "e:/programdata/anaconda3/envs/py3/lib/site-packages/scrapy/core/downloader/middleware.py", line 43, in process_request 237 | defer.returnValue((yield download_func(request=request,spider=spider))) 238 | twisted.internet.error.TCPTimedOutError: TCP connection timed out: 10060: 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。. 239 | 2018-10-23 18:29:41 [scrapy.extensions.logstats] INFO: Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min) 240 | 2018-10-23 18:29:41 [scrapy.core.engine] INFO: Closing spider (finished) 241 | 2018-10-23 18:29:41 [scrapy.extensions.feedexport] INFO: Stored jsonlines feed (2 items) in: file:///C:/Users/win7/items/demo/test/2018-10-23_182826.jl 242 | """ 243 | 244 | END = u"""2018-10-23 18:29:41 [scrapy.statscollectors] INFO: Dumping Scrapy stats: 245 | {'downloader/exception_count': 3, 246 | 'downloader/exception_type_count/twisted.internet.error.TCPTimedOutError': 3, 247 | b'downloader/request_bytes': 13, 248 | u'downloader/request_count': 7, 249 | 'downloader/request_method_count/GET': 7, 250 | 'downloader/response_bytes': 1669, 251 | 'downloader/response_count': 4, 252 | 'downloader/response_status_count/200': 2, 253 | 'downloader/response_status_count/302': 1, 254 | 'downloader/response_status_count/404': 1, 255 | 'dupefilter/filtered': 1, 256 | 'finish_reason': 'finished', 257 | 'finish_time': datetime.datetime(2018, 10, 23, 10, 29, 41, 174719), 258 | 'httperror/response_ignored_count': 1, 259 | 'httperror/response_ignored_status_count/404': 1, 260 | 'item_scraped_count': 2, 261 | 'log_count/CRITICAL': 5, 262 | 'log_count/DEBUG': 14, 263 | 'log_count/ERROR': 5, 264 | 'log_count/INFO': 75, 265 | 'log_count/WARNING': 3, 266 | 'offsite/domains': 1, 267 | 'offsite/filtered': 1, 268 | 'request_depth_max': 1, 269 | 'response_received_count': 3, 270 | 'retry/count': 2, 271 | 'retry/max_reached': 1, 272 | 'retry/reason_count/twisted.internet.error.TCPTimedOutError': 2, 273 | 'scheduler/dequeued': 7, 274 | 'scheduler/dequeued/memory': 7, 275 | 'scheduler/enqueued': 7, 276 | 'scheduler/enqueued/memory': 7, 277 | 'start_time': datetime.datetime(2018, 10, 23, 10, 28, 35, 70938)} 278 | 2018-10-23 18:29:42 [scrapy.core.engine] INFO: Spider closed (finished)""" 279 | -------------------------------------------------------------------------------- /tests/logs.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/my8100/logparser/31e2661617d8f66b37d55c1cf49c225ea11fa6b0/tests/logs.zip -------------------------------------------------------------------------------- /tests/test_logparser.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import os 3 | import re 4 | import time 5 | 6 | from tests.demo_log import FRONT, END 7 | from tests.utils import cst 8 | 9 | 10 | # { 11 | # "status": "ok", 12 | # "datas": {}, 13 | # "settings_py": "logparser/logparser/settings.py", 14 | # "settings": {...}, 15 | # "last_update_timestamp": "1546272001" 16 | # "last_update_time": "2019-01-01 
00:00:01" 17 | # "logparser_version": "0.8.1", 18 | # } 19 | def test_empty_logs_dir(psr): 20 | parser = psr(execute_main=False) 21 | # cst.STATS_JSON_PATH is created in Parser.__init__() 22 | for path in [cst.LOG_PATH, cst.TXT_PATH, cst.STATS_JSON_PATH]: 23 | os.remove(path) 24 | parser.main() 25 | for path in [cst.LOG_PATH, cst.TXT_PATH, cst.LOG_JSON_PATH, cst.TXT_JSON_PATH]: 26 | assert not os.path.exists(path) 27 | assert os.path.exists(cst.STATS_JSON_PATH) 28 | stats = cst.read_data(cst.STATS_JSON_PATH) 29 | default_stats = dict(status='ok', datas={}, logparser_version=cst.LOGPARSER_VERSION) 30 | assert set(stats.keys()) == {'status', 'datas', 'settings_py', 'settings', 31 | 'last_update_timestamp', 'last_update_time', 'logparser_version'} 32 | for k, v in default_stats.items(): 33 | assert stats[k] == v 34 | # last_update_time, comes from last_update_timestamp 35 | assert cst.string_to_timestamp(stats['last_update_time']) == stats['last_update_timestamp'] 36 | 37 | 38 | def test_demo_log_files(psr): 39 | psr() 40 | log_data = cst.read_data(cst.LOG_JSON_PATH) 41 | txt_data = cst.read_data(cst.TXT_JSON_PATH) 42 | for k in cst.PARSE_KEYS: 43 | if k not in ['last_update_time', 'last_update_timestamp']: 44 | assert log_data[k] == txt_data[k] 45 | 46 | # 2019-01-01T00_00_01.log 47 | # 2019-01-01T00_00_02.txt 48 | for case, data in zip(['log', 'txt'], [log_data, txt_data]): 49 | cst.check_demo_data(data) 50 | 51 | if case == 'log': 52 | job = cst.JOB 53 | ext = 'log' 54 | else: 55 | job = cst.JOB_TXT 56 | ext = 'txt' 57 | assert data['log_path'].endswith('%s.%s' % (job, ext)) 58 | assert data['json_path'].endswith('%s.json' % job) 59 | assert data['json_url'].endswith('%s.json' % job) 60 | assert data['json_url'].startswith('http://%s' % cst.SCRAPYD_SERVER) 61 | 62 | assert data['size'] == cst.SIZE 63 | assert data['position'] == cst.SIZE 64 | assert data['status'] == cst.STATUS 65 | assert data['_head'] == cst.LOG_HEAD_LINES 66 | assert data['logparser_version'] == cst.LOGPARSER_VERSION 67 | 68 | 69 | def test_log_no_change(psr): 70 | start_time = time.time() 71 | psr(parse_round_interval=1, exit_timeout=0.001) # parse for first time, exit 72 | parse_time = time.time() - start_time 73 | exit_timeout = parse_time * 3 # Ensure a sleep 74 | interval = exit_timeout + 5 75 | psr(parse_round_interval=interval, exit_timeout=exit_timeout) 76 | stats = cst.read_data(cst.STATS_JSON_PATH) 77 | data = cst.read_data(cst.LOG_JSON_PATH) 78 | assert stats['datas'][cst.PROJECT][cst.SPIDER][cst.JOB]['last_update_time'] == data['last_update_time'] 79 | # last_update_timestamp does not contain the float part of a timestamp, so add '- 2' on the right 80 | assert stats['last_update_timestamp'] - data['last_update_timestamp'] > interval - 2 81 | 82 | 83 | def test_new_file_read_data(psr): 84 | psr() 85 | log_data = cst.read_data(cst.LOG_JSON_PATH) 86 | last_update_timestamp = log_data['last_update_timestamp'] 87 | 88 | # Skip parsing since data with same size found 89 | # Old file with old size 90 | parser = psr(execute_main=False, reset_logs=False) 91 | for i in range(2): 92 | time.sleep(2) 93 | parser.main() 94 | log_data = cst.read_data(cst.LOG_JSON_PATH) 95 | assert log_data['last_update_timestamp'] == last_update_timestamp 96 | cst.check_demo_data(log_data) 97 | 98 | # Old logfile with smaller size 99 | cst.write_text(cst.LOG_PATH, FRONT + END.replace('memory', '')) 100 | parser.main() 101 | log_data = cst.read_data(cst.LOG_JSON_PATH) 102 | assert log_data['last_update_timestamp'] == last_update_timestamp 
103 | cst.check_demo_data(log_data) 104 | stats = cst.read_data(cst.STATS_JSON_PATH) 105 | assert cst.PROJECT not in stats['datas'] 106 | # -> parse in next round 107 | parser.main() 108 | log_data = cst.read_data(cst.LOG_JSON_PATH) 109 | assert log_data['last_update_timestamp'] > last_update_timestamp 110 | cst.check_demo_data(log_data) 111 | stats = cst.read_data(cst.STATS_JSON_PATH) 112 | assert cst.PROJECT in stats['datas'] 113 | 114 | # Read data fail 115 | time.sleep(2) 116 | cst.write_text(cst.LOG_JSON_PATH, u'') 117 | psr(reset_logs=False) 118 | log_data = cst.read_data(cst.LOG_JSON_PATH) 119 | assert log_data['last_update_timestamp'] > last_update_timestamp 120 | cst.check_demo_data(log_data) 121 | 122 | 123 | def test_new_size_read_data(psr): 124 | appended_log = u'test' 125 | appended_log_length = len(appended_log) 126 | parser = psr() 127 | log_data = cst.read_data(cst.LOG_JSON_PATH) 128 | assert log_data['logparser_version'] == cst.LOGPARSER_VERSION 129 | cst.check_demo_data(log_data) 130 | last_update_timestamp = log_data['last_update_timestamp'] 131 | 132 | # Valid but short appended log 133 | cst.write_text(cst.LOG_PATH, appended_log, append=True) 134 | time.sleep(2) 135 | parser.main() 136 | assert os.path.getsize(cst.APPENDED_LOG_PATH) == 0 137 | log_data = cst.read_data(cst.LOG_JSON_PATH) 138 | assert log_data['last_update_timestamp'] > last_update_timestamp 139 | assert log_data['size'] == cst.SIZE + appended_log_length 140 | assert log_data['position'] == cst.SIZE 141 | cst.check_demo_data(log_data) # Previous parsed result is not affected by short appended log 142 | 143 | # Mismatching version 144 | log_data['logparser_version'] = '0.0.0' 145 | cst.write_text(cst.LOG_JSON_PATH, cst.json_dumps(log_data)) 146 | log_data = cst.read_data(cst.LOG_JSON_PATH) 147 | assert log_data['logparser_version'] == '0.0.0' 148 | 149 | cst.write_text(cst.LOG_PATH, appended_log, append=True) 150 | now_size = cst.SIZE + appended_log_length * 2 151 | parser.main() 152 | assert os.path.getsize(cst.APPENDED_LOG_PATH) == now_size 153 | log_data = cst.read_data(cst.LOG_JSON_PATH) 154 | assert log_data['logparser_version'] == cst.LOGPARSER_VERSION 155 | assert log_data['size'] == now_size 156 | assert log_data['position'] == now_size 157 | cst.check_demo_data(log_data) 158 | 159 | # Broken json file 160 | cst.write_text(cst.LOG_JSON_PATH, appended_log, append=True) 161 | cst.write_text(cst.LOG_PATH, appended_log, append=True) 162 | now_size = cst.SIZE + appended_log_length * 3 163 | parser.main() 164 | assert os.path.getsize(cst.APPENDED_LOG_PATH) == now_size 165 | log_data = cst.read_data(cst.LOG_JSON_PATH) 166 | assert log_data['size'] == now_size 167 | assert log_data['position'] == now_size 168 | cst.check_demo_data(log_data) 169 | 170 | 171 | def test_actual_lines(psr): 172 | """ 173 | 2019-01-01 00:00:01 DEBUG 1 174 | a 175 | 176 | b 177 | 178 | 2019-01-01 00:00:01 DEBUG 2 179 | """ 180 | prefix = u'2019-01-01 00:00:01 DEBUG ' 181 | parser = psr(execute_main=False, log_head_lines=5, log_tail_lines=10) 182 | # In windows, '\r\n' is stored as: '\r\r\n' 183 | cst.write_text(cst.LOG_PATH, prefix + '1\na\n\nb\n\n') 184 | cst.write_text(cst.LOG_PATH, prefix + '2\n', append=True) 185 | parser.main() 186 | log_data = cst.read_data(cst.LOG_JSON_PATH) 187 | assert '1\na\n\nb\n\n' in log_data['head'] 188 | assert log_data['_head'] == log_data['head'] 189 | 190 | for i in range(3, 8): 191 | cst.write_text(cst.LOG_PATH, prefix + '%s\n' % i, append=True) 192 | parser.main() 193 | log_data = 
cst.read_data(cst.LOG_JSON_PATH) 194 | assert log_data['_head'] == 5 195 | for i in range(1, 8): 196 | if i <= 3: 197 | assert 'DEBUG %s' % i in log_data['head'] 198 | else: 199 | assert 'DEBUG %s' % i not in log_data['head'] 200 | head = log_data['head'] 201 | 202 | for i in range(8, 12): 203 | cst.write_text(cst.LOG_PATH, prefix + '%s\n' % i, append=True) 204 | parser.main() 205 | log_data = cst.read_data(cst.LOG_JSON_PATH) 206 | assert log_data['_head'] == 5 207 | assert log_data['head'] == head 208 | assert log_data['tail'].startswith('b\n\n') 209 | for i in range(2, 11): 210 | assert 'DEBUG %s' % i in log_data['tail'] 211 | 212 | 213 | def test_appended_log(psr): 214 | first_log_time = '2018-10-23 18:28:34' 215 | 216 | parser = psr(execute_main=False, log_head_lines=10, log_tail_lines=50) 217 | # 2018-10-23 18:28:35 [test] WARNING: warn 218 | front_head, front_tail = re.split(r'WARNING: warn[^i]', FRONT) 219 | # {'item': 2} 220 | # 2018-10-23 18:28:40 [..logstats] INFO: Crawled 3 pages (at 60 pages/min), scraped 2 items (at 60 items/min) 221 | front_mid, front_tail = front_tail.split("{'item': 2}") 222 | 223 | cst.write_text(cst.LOG_PATH, u'') 224 | # Test short appended log 225 | for idx, appended_log in enumerate([u'', u'2018-10-23 18:28:34 DEBUG\n', 226 | u'2018-10-23 18:28:34 INFO\n', u'test\n']): 227 | cst.write_text(cst.LOG_PATH, appended_log, append=True) 228 | parser.main() 229 | data = cst.read_data(cst.LOG_JSON_PATH) 230 | # Text to be ignored for next round: '2018-10-23 18:28:34 INFO\r\n' 231 | # appended log: 2018-10-23 18:28:34 DEBUG 232 | # "_head": "2018-10-23 18:28:34 DEBUG\n", 233 | # "head": "2018-10-23 18:28:34 DEBUG\n", 234 | # "tail": "2018-10-23 18:28:34 DEBUG\n", 235 | if idx >= 2: 236 | assert data['first_log_time'] == first_log_time 237 | assert data['_head'] 238 | else: 239 | assert data['first_log_time'] == cst.NA 240 | assert not data['_head'] 241 | assert data['finish_reason'] == cst.NA 242 | assert data['pages'] is None 243 | assert data['items'] is None 244 | 245 | cst.write_text(cst.LOG_PATH, front_head, append=True) 246 | parser.main() 247 | data = cst.read_data(cst.LOG_JSON_PATH) 248 | assert data['first_log_time'] == first_log_time 249 | assert data['latest_log_time'] == '2018-10-23 18:28:35' 250 | assert data['datas'] == [['2018-10-23 18:28:35', 0, 0, 0, 0]] 251 | assert data['pages'] == 0 252 | assert data['items'] == 0 253 | for k in cst.LATEST_MATCHES_RESULT_DICT.keys(): 254 | if k in ['scrapy_version', 'telnet_console', 'resuming_crawl', 'latest_stat']: 255 | assert data['latest_matches'][k] 256 | else: 257 | assert not data['latest_matches'][k] 258 | for k in cst.LOG_CATEGORIES_RESULT_DICT.keys(): 259 | assert data['log_categories'][k]['count'] == 0 260 | assert data['log_categories'][k]['details'] == [] 261 | assert data['shutdown_reason'] == cst.NA 262 | assert data['finish_reason'] == cst.NA 263 | assert '[scrapy.utils.log] INFO: Scrapy 1.5.1 started' in data['head'] 264 | assert '[scrapy.utils.log] INFO: Scrapy 1.5.1 started' in data['tail'] 265 | 266 | cst.write_text(cst.LOG_PATH, u'WARNING: warn\n' + front_mid, append=True) 267 | parser.main() 268 | data = cst.read_data(cst.LOG_JSON_PATH) 269 | assert data['first_log_time'] == first_log_time 270 | assert data['latest_log_time'] == '2018-10-23 18:28:39' 271 | assert (data['datas'][0] == ['2018-10-23 18:28:35', 0, 0, 0, 0] 272 | and ['2018-10-23 18:28:37', 1, 60, 0, 0] in data['datas'] 273 | and ['2018-10-23 18:28:38', 2, 60, 1, 60] in data['datas'] 274 | and data['datas'][-1] == 
['2018-10-23 18:28:39', 2, 0, 1, 0] 275 | and len(data['datas']) == 5) 276 | assert data['pages'] == 2 277 | assert data['items'] == 1 278 | for k in cst.LATEST_MATCHES_RESULT_DICT.keys(): 279 | if k in ['telnet_username', 'telnet_password']: 280 | assert not data['latest_matches'][k] 281 | else: 282 | assert data['latest_matches'][k] 283 | assert data['latest_matches']['latest_item'] == "{'item': 1}" 284 | for k, (count, __) in cst.LOG_CATEGORIES_RESULT_DICT.items(): 285 | if k == 'error_logs': 286 | assert data['log_categories'][k]['count'] == 4 287 | elif k == 'retry_logs': 288 | assert data['log_categories'][k]['count'] == 0 289 | else: 290 | assert data['log_categories'][k]['count'] == count 291 | if k == 'retry_logs': 292 | assert data['log_categories'][k]['details'] == [] 293 | else: 294 | assert data['log_categories'][k]['details'] 295 | assert data['shutdown_reason'] == cst.NA 296 | assert data['finish_reason'] == cst.NA 297 | 298 | cst.write_text(cst.LOG_PATH, u"{'item': 2}" + front_tail, append=True) 299 | parser.main() 300 | data = cst.read_data(cst.LOG_JSON_PATH) 301 | assert data['first_log_time'] == first_log_time 302 | assert data['latest_log_time'] == '2018-10-23 18:29:41' 303 | assert (data['datas'][0] == ['2018-10-23 18:28:35', 0, 0, 0, 0] 304 | and data['datas'][-1] == ['2018-10-23 18:29:41', 3, 0, 2, 0] 305 | and len(data['datas']) == 67) 306 | assert data['pages'] == 3 307 | assert data['items'] == 2 308 | for k in cst.LATEST_MATCHES_RESULT_DICT.keys(): 309 | if k in ['telnet_username', 'telnet_password']: 310 | assert not data['latest_matches'][k] 311 | else: 312 | assert data['latest_matches'][k] 313 | for k, (count, __) in cst.LOG_CATEGORIES_RESULT_DICT.items(): 314 | assert data['log_categories'][k]['count'] == count 315 | assert data['log_categories'][k]['details'] 316 | assert data['shutdown_reason'] == cst.NA 317 | assert data['finish_reason'] == cst.NA 318 | 319 | # 'finish_reason': 'finished', 320 | # 'finish_time': datetime.datetime(2018, 10, 23, 10, 29, 41, 174719), 321 | end_head, end_tail = END.split("'finish_time'") 322 | cst.write_text(cst.LOG_PATH, end_head, append=True) 323 | parser.main() 324 | data = cst.read_data(cst.LOG_JSON_PATH) 325 | assert data['first_log_time'] == first_log_time 326 | assert data['latest_log_time'] == '2018-10-23 18:29:41' 327 | assert (data['datas'][0] == ['2018-10-23 18:28:35', 0, 0, 0, 0] 328 | and data['datas'][-1] == ['2018-10-23 18:29:41', 3, 0, 2, 0] 329 | and len(data['datas']) == 67) 330 | assert data['pages'] == 3 331 | assert data['items'] == 2 332 | for k in cst.LATEST_MATCHES_RESULT_DICT.keys(): 333 | if k in ['telnet_username', 'telnet_password']: 334 | assert not data['latest_matches'][k] 335 | else: 336 | assert data['latest_matches'][k] 337 | for k, (count, __) in cst.LOG_CATEGORIES_RESULT_DICT.items(): 338 | assert data['log_categories'][k]['count'] == count 339 | assert data['log_categories'][k]['details'] 340 | assert data['shutdown_reason'] == cst.NA 341 | assert data['finish_reason'] == cst.NA 342 | # 2018-10-23 18:29:41 [scrapy.extensions.feedexport] INFO: Stored jsonlines feed (2 items) in: file:/// 343 | # 2018-10-23 18:29:41 [scrapy.statscollectors] INFO: Dumping Scrapy stats: 344 | assert 'INFO: Stored jsonlines feed' in data['tail'] 345 | assert 'INFO: Dumping Scrapy stats:' not in data['tail'] 346 | 347 | cst.write_text(cst.LOG_PATH, u"'finish_time'" + end_tail, append=True) 348 | parser.main() 349 | data = cst.read_data(cst.LOG_JSON_PATH) 350 | assert data['first_log_time'] == first_log_time 351 
| assert data['latest_log_time'] == '2018-10-23 18:29:42' 352 | assert (data['datas'][0] == ['2018-10-23 18:28:35', 0, 0, 0, 0] 353 | and data['datas'][-1] == ['2018-10-23 18:29:41', 3, 0, 2, 0] 354 | and len(data['datas']) == 67) 355 | assert data['pages'] == 3 356 | assert data['items'] == 2 357 | for k in cst.LATEST_MATCHES_RESULT_DICT.keys(): 358 | if k in ['telnet_username', 'telnet_password']: 359 | assert not data['latest_matches'][k] 360 | else: 361 | assert data['latest_matches'][k] 362 | for k, (count, __) in cst.LOG_CATEGORIES_RESULT_DICT.items(): 363 | assert data['log_categories'][k]['count'] == count 364 | assert data['log_categories'][k]['details'] 365 | assert data['shutdown_reason'] == cst.NA 366 | assert data['finish_reason'] == 'finished' 367 | # assert data['size'] == 15883 # != cst.SIZE 15862 '2018-10-23 18:28:34\n' \r\n => 15883 368 | # assert data['position'] == 15883 # != cst.SIZE 15862 369 | assert data['size'] == data['position'] 370 | assert '[scrapy.utils.log] INFO: Scrapy 1.5.1 started' in data['head'] 371 | assert '[scrapy.core.engine] INFO: Spider closed' not in data['head'] 372 | assert '[scrapy.utils.log] INFO: Scrapy 1.5.1 started' not in data['tail'] 373 | assert '[scrapy.core.engine] INFO: Spider closed' in data['tail'] 374 | -------------------------------------------------------------------------------- /tests/test_parse.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | 4 | from logparser import parse 5 | 6 | from tests.demo_log import (ERROR_404, SHUTDOWN, FRONT, END, 7 | TELNET_160_DEFAULT, TELNET_160_USERNAME, 8 | TELNET_160_PASSWORD, TELNET_160_USERNAME_PASSWORD, 9 | LATEST_SCRAPE_ITEM_ONE_LINE, LATEST_SCRAPE_ITEM_MULTIPLE_LINES, 10 | LATEST_SCRAPE_ITEM_MIXED, SCRAPY_FIELDSTATS) 11 | from tests.utils import cst 12 | 13 | 14 | # empty log 15 | def test_invalid_log(): 16 | for text in ["", ERROR_404]: 17 | data = parse(text) 18 | cst.json_dumps(data) 19 | if not text: 20 | assert not (data['head'] or data['tail']) 21 | else: 22 | assert '404 - No Such Resource' in data['head'] and '404 - No Such Resource' in data['tail'] 23 | 24 | assert set(data.keys()) == set(cst.PARSE_KEYS) 25 | for k in ['first_log_time', 'latest_log_time', 'runtime', 'shutdown_reason', 'finish_reason']: 26 | assert data[k] == cst.NA 27 | for k in ['first_log_timestamp', 'latest_log_timestamp', 'latest_crawl_timestamp', 'latest_scrape_timestamp']: 28 | assert data[k] == 0 29 | for k in ['pages', 'items']: 30 | assert data[k] is None 31 | # assert data['last_update_timestamp'] > 0 # 1546272001 32 | # assert len(data['last_update_time']) == 19 # "2019-01-01 00:00:01" 33 | assert cst.string_to_timestamp(data['last_update_time']) == data['last_update_timestamp'] 34 | assert data['datas'] == [] 35 | 36 | for v in data['latest_matches'].values(): 37 | assert v == '' 38 | assert set(data['latest_matches'].keys()) == set(cst.LATEST_MATCHES_RESULT_DICT.keys()) 39 | 40 | for v in data['log_categories'].values(): 41 | assert v == dict(count=0, details=[]) 42 | assert set(data['log_categories'].keys()) == set(cst.LOG_CATEGORIES_RESULT_DICT.keys()) 43 | 44 | 45 | def test_demo_log(): 46 | modified_logstats = FRONT.replace("Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min)", 47 | "Crawled 1 pages (at 2 pages/min), scraped 3 items (at 4 items/min)") 48 | for case, text in zip(['without_stats_dumped', 'whole_log', 'modified_logstats'], 49 | [FRONT, FRONT + END, modified_logstats + END]): 50 | data = 
parse(text, headlines=50, taillines=100) # 180 lines in total 51 | # cst.json_dumps(data) 52 | 53 | if case == 'without_stats_dumped': 54 | cst.check_demo_data(data, without_stats_dumped=True) 55 | elif case == 'modified_logstats': # to test update_data_with_crawler_stats() 56 | cst.check_demo_data(data, without_stats_dumped=False, modified_logstats=True) 57 | else: 58 | cst.check_demo_data(data, without_stats_dumped=False) 59 | 60 | 61 | def test_latest_item_unicode_escape(): 62 | text = (FRONT + END).replace("{'item': 2}", u"{u'Chinese \\u6c49\\u5b57': 2}") 63 | data = parse(text) 64 | assert data['latest_matches']['latest_item'] == u"{u'Chinese 汉字': 2}" 65 | 66 | 67 | def test_only_stats_dumped(): 68 | replaces = [ 69 | ("'downloader/response_status_count/302': 1,", 70 | "'downloader/response_status_count/302': 7,\n 'downloader/response_status_count/301': 8,"), 71 | ("'response_received_count': 3,", "'response_received_count': 30,"), 72 | ("'item_scraped_count': 2,", "'item_scraped_count': 20,"), 73 | ("'log_count/ERROR': 5,", "'log_count/ERROR': 4,"), 74 | ("'finish_reason': 'finished',", "'finish_reason': 'forceshutdown',") 75 | ] 76 | dict_count = dict( 77 | critical_logs=5, 78 | error_logs=4, 79 | warning_logs=3, 80 | redirect_logs=15, 81 | retry_logs=2, 82 | ignore_logs=1 83 | ) 84 | text = END 85 | for replace in replaces: 86 | text = text.replace(*replace) 87 | data = parse(text, headlines=50, taillines=50) 88 | # cst.json_dumps(data) 89 | assert data['first_log_time'] == '2018-10-23 18:29:41' 90 | assert data['latest_log_time'] == '2018-10-23 18:29:42' 91 | assert data['runtime'] == '0:00:01' 92 | assert data['datas'] == [] 93 | assert data['pages'] == 30 94 | assert data['items'] == 20 95 | for k, v in data['latest_matches'].items(): 96 | assert v == '' 97 | for k, v in dict_count.items(): 98 | assert data['log_categories'][k]['count'] == v 99 | assert data['log_categories'][k]['details'] == [] 100 | assert data['finish_reason'] == 'forceshutdown' 101 | 102 | 103 | # Received SIGTERM twice 104 | def test_shutdown_reason(): 105 | data = parse(SHUTDOWN) 106 | assert data['shutdown_reason'] == 'Received SIGTERM twice' 107 | assert data['finish_reason'] == 'shutdown' 108 | 109 | data = parse(SHUTDOWN.replace('twice', '')) 110 | assert data['shutdown_reason'] == 'Received SIGTERM' 111 | assert data['finish_reason'] == 'shutdown' 112 | 113 | 114 | def test_telnet_info(): 115 | data = parse(TELNET_160_DEFAULT) 116 | d = data['latest_matches'] 117 | assert d['scrapy_version'] == '1.6.0' 118 | assert d['telnet_console'] == '127.0.0.1:6024' 119 | assert d['telnet_username'] == '' 120 | assert d['telnet_password'] == '9d3a29f17ee1bf9a' 121 | 122 | data = parse(TELNET_160_USERNAME) 123 | d = data['latest_matches'] 124 | assert d['telnet_username'] == 'usr123' 125 | assert d['telnet_password'] == 'd24ad6be287d69b3' 126 | 127 | data = parse(TELNET_160_PASSWORD) 128 | d = data['latest_matches'] 129 | assert d['telnet_username'] == '' 130 | assert d['telnet_password'] == '456psw' 131 | 132 | data = parse(TELNET_160_USERNAME_PASSWORD) 133 | d = data['latest_matches'] 134 | assert d['telnet_username'] == 'usr123' 135 | assert d['telnet_password'] == '456psw' 136 | 137 | 138 | def test_latest_scrape_item(): 139 | data = parse(LATEST_SCRAPE_ITEM_ONE_LINE) 140 | d = data['latest_matches'] 141 | latest_scrape = '2019-01-01 00:00:01 [scrapy.core.scraper] DEBUG: Scraped from <200 http://httpbin.org/get>' 142 | assert d['latest_scrape'] == latest_scrape 143 | assert d['latest_item'] == "{'item': 1}" 
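# The remaining cases cover items that are pretty-printed across several lines: the lines
# following the latest "Scraped from" entry are captured as latest_matches['latest_item'],
# and in the MIXED log the newer multi-line item (00:00:03) supersedes the earlier one-line item.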
144 | 145 | data = parse(LATEST_SCRAPE_ITEM_MULTIPLE_LINES) 146 | d = data['latest_matches'] 147 | latest_scrape = '2019-01-01 00:00:02 [scrapy.core.scraper] DEBUG: Scraped from <200 http://httpbin.org/get>' 148 | assert d['latest_scrape'] == latest_scrape 149 | assert json.loads(d['latest_item'].replace("'", '"')) == dict(item=2) 150 | 151 | data = parse(LATEST_SCRAPE_ITEM_MIXED) 152 | d = data['latest_matches'] 153 | latest_scrape = '2019-01-01 00:00:03 [scrapy.core.scraper] DEBUG: Scraped from <200 http://httpbin.org/get>' 154 | assert d['latest_scrape'] == latest_scrape 155 | assert json.loads(d['latest_item'].replace("u'", "'").replace("'", '"')) == dict(item={u'Chinese 汉字': 3}) 156 | 157 | 158 | def test_scrapy_fieldstats(): 159 | data = parse(SCRAPY_FIELDSTATS) 160 | d = data['crawler_stats'] 161 | assert d['fields_coverage'] == {u'Chinese 汉字': '50%', 'author': {'a': 1, 'b': 2}} 162 | -------------------------------------------------------------------------------- /tests/test_settings.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import io 3 | import os 4 | from shutil import copy 5 | import time 6 | 7 | from tests.demo_log import END 8 | from tests.utils import cst 9 | 10 | 11 | # SCRAPYD_SERVER = '127.0.0.1:6800' 12 | def test_scrapyd_server(psr): 13 | default = '127.0.0.1:6800' 14 | json_url = 'http://%s/logs/%s/%s/%s.json' % (default, cst.PROJECT, cst.SPIDER, cst.JOB) 15 | psr() 16 | stats = cst.read_data(cst.STATS_JSON_PATH) 17 | assert stats['datas'][cst.PROJECT][cst.SPIDER][cst.JOB]['json_url'] == json_url 18 | 19 | localhost = 'localhost:6800' 20 | json_url = 'http://%s/logs/%s/%s/%s.json' % (localhost, cst.PROJECT, cst.SPIDER, cst.JOB) 21 | psr(scrapyd_server=localhost) 22 | stats = cst.read_data(cst.STATS_JSON_PATH) 23 | assert stats['datas'][cst.PROJECT][cst.SPIDER][cst.JOB]['json_url'] == json_url 24 | 25 | 26 | # SCRAPYD_LOGS_DIR = '' 27 | def test_scrapyd_logs_dir(psr): 28 | paths_fixed = [cst.LOGS_PATH, cst.LOG_PATH, cst.TXT_PATH, cst.GBK_LOG_PATH, cst.STATS_JSON_PATH] 29 | paths_generated = [cst.LOG_JSON_PATH, cst.TXT_JSON_PATH, cst.APPENDED_LOG_PATH, 30 | cst.DATAS_COMPLETE_JSON_PATH, cst.DATAS_SIMPLIFIED_JSON_PATH] 31 | paths_not_exist = [os.path.join(cst.LOGS_PATH, 'gbk.json'), 32 | os.path.join(cst.LOGS_PATH, cst.PROJECT, cst.SPIDER, 'gbk.json'), 33 | os.path.join(cst.LOGS_PATH, cst.PROJECT_TXT, cst.SPIDER_TXT, 'gbk.json')] 34 | 35 | parser = psr(execute_main=False) 36 | for path in paths_fixed: 37 | assert os.path.exists(path) 38 | for path in paths_generated + paths_not_exist: 39 | assert not os.path.exists(path) 40 | parser.main() 41 | for path in paths_fixed + paths_generated: 42 | assert os.path.exists(path) 43 | for path in paths_not_exist: 44 | assert not os.path.exists(path) 45 | 46 | 47 | # PARSE_ROUND_INTERVAL = 10 48 | def test_parse_round_interval(psr): 49 | # mtime = os.path.getmtime(cst.LOG_JSON_PATH) 50 | start_time = time.time() 51 | psr(parse_round_interval=1, exit_timeout=0.001) # parse for first time, exit 52 | parse_time = time.time() - start_time 53 | exit_timeout = parse_time * 3 # Ensure a sleep 54 | for interval in [exit_timeout + 5, (exit_timeout + 5) * 3]: 55 | start_time = time.time() 56 | # parse for first time, sleep interval, parse for second time, exit 57 | psr(parse_round_interval=interval, exit_timeout=exit_timeout) 58 | assert time.time() - start_time > interval 59 | 60 | 61 | # ENABLE_TELNET = True 62 | def test_disable_telnet(psr): 63 | parser = 
psr(execute_main=False, enable_telnet=True) 64 | assert parser.ENABLE_TELNET 65 | 66 | parser = psr(execute_main=False, enable_telnet=False) 67 | assert not parser.ENABLE_TELNET 68 | 69 | 70 | # LOG_ENCODING = 'utf-8' 71 | def test_log_encoding(psr): 72 | # TCP connection timed out: 10060: 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。. 73 | psr() 74 | data = cst.read_data(cst.LOG_JSON_PATH) 75 | for detail in data['log_categories']['retry_logs']['details']: 76 | assert u'连接尝试失败' in detail 77 | 78 | psr(log_encoding='gbk') 79 | data = cst.read_data(cst.LOG_JSON_PATH) 80 | for detail in data['log_categories']['retry_logs']['details']: 81 | assert u'连接尝试失败' not in detail and 'TCP connection timed out: 10060:' in detail 82 | 83 | # 2018-10-23 18:28:33 [test] 3: test utf8: 测试中文 84 | parser = psr(execute_main=False, log_encoding=cst.LOG_ENCODING) 85 | copy(cst.GBK_LOG_PATH, cst.LOG_PATH) 86 | parser.main() 87 | data = cst.read_data(cst.LOG_JSON_PATH) 88 | assert '2018-10-23 18:28:33 [test] 3: test utf8:' in data['head'] and u'测试中文' not in data['head'] 89 | 90 | parser = psr(execute_main=False, log_encoding='gbk') 91 | copy(cst.GBK_LOG_PATH, cst.LOG_PATH) 92 | parser.main() 93 | data = cst.read_data(cst.LOG_JSON_PATH) 94 | assert '2018-10-23 18:28:33 [test] 3: test utf8:' in data['head'] and u'测试中文' in data['head'] 95 | 96 | 97 | # LOG_EXTENSIONS=['.log', '.txt'] 98 | def test_log_extensions(psr): 99 | if os.path.exists(cst.STATS_JSON_PATH): 100 | os.remove(cst.STATS_JSON_PATH) 101 | psr(log_extensions=[]) 102 | stats = cst.read_data(cst.STATS_JSON_PATH) 103 | assert stats['datas'] == {} 104 | 105 | psr(log_extensions=['.log']) 106 | stats = cst.read_data(cst.STATS_JSON_PATH) 107 | assert len(stats['datas']) == 1 and cst.JOB in stats['datas'][cst.PROJECT][cst.SPIDER] 108 | 109 | psr(log_extensions=['.txt']) 110 | stats = cst.read_data(cst.STATS_JSON_PATH) 111 | assert len(stats['datas']) == 1 and cst.JOB_TXT in stats['datas'][cst.PROJECT_TXT][cst.SPIDER_TXT] 112 | 113 | psr(log_extensions=cst.LOG_EXTENSIONS) 114 | stats = cst.read_data(cst.STATS_JSON_PATH) 115 | assert (len(stats['datas']) == 2 116 | and cst.JOB in stats['datas'][cst.PROJECT][cst.SPIDER] 117 | and cst.JOB_TXT in stats['datas'][cst.PROJECT_TXT][cst.SPIDER_TXT]) 118 | 119 | 120 | # LOG_HEAD_LINES = 100, LOG_TAIL_LINES = 200 121 | def test_log_headlines_taillines(psr): 122 | psr(log_head_lines=5, log_tail_lines=10) 123 | data = cst.read_data(cst.LOG_JSON_PATH) 124 | assert len(data['head'].split('\n')) == 5 125 | assert len(data['tail'].split('\n')) == 10 126 | 127 | 128 | # LOG_CATEGORIES_LIMIT = 10 129 | def test_log_categories_limit(psr): 130 | log_categories_limit = 3 131 | psr(log_categories_limit=log_categories_limit) 132 | data = cst.read_data(cst.LOG_JSON_PATH) 133 | cst.check_demo_data(data, log_categories_limit=log_categories_limit) 134 | 135 | 136 | # JOBS_TO_KEEP=100 137 | def test_jobs_to_keep(psr): 138 | parser = psr() 139 | datas_full = cst.read_data(cst.DATAS_COMPLETE_JSON_PATH) 140 | datas_simplified = cst.read_data(cst.DATAS_SIMPLIFIED_JSON_PATH) 141 | for datas in [datas_full, datas_simplified]: 142 | assert set(datas.keys()) == {cst.LOG_PATH, cst.TXT_PATH} 143 | # delete a logfile 144 | os.remove(cst.TXT_PATH) 145 | parser.main() 146 | datas_full = cst.read_data(cst.DATAS_COMPLETE_JSON_PATH) 147 | datas_simplified = cst.read_data(cst.DATAS_SIMPLIFIED_JSON_PATH) 148 | for datas in [datas_full, datas_simplified]: 149 | assert set(datas.keys()) == {cst.LOG_PATH, cst.TXT_PATH} 150 | # add a logfile 151 | copy(cst.LOG_PATH, 
cst.LOG_TEMP_PATH) 152 | parser.main() 153 | datas_full = cst.read_data(cst.DATAS_COMPLETE_JSON_PATH) 154 | datas_simplified = cst.read_data(cst.DATAS_SIMPLIFIED_JSON_PATH) 155 | for datas in [datas_full, datas_simplified]: 156 | assert set(datas.keys()) == {cst.LOG_PATH, cst.TXT_PATH, cst.LOG_TEMP_PATH} 157 | 158 | parser = psr(jobs_to_keep=1) 159 | datas_full = cst.read_data(cst.DATAS_COMPLETE_JSON_PATH) 160 | datas_simplified = cst.read_data(cst.DATAS_SIMPLIFIED_JSON_PATH) 161 | for datas in [datas_full, datas_simplified]: 162 | assert set(datas.keys()) == {cst.LOG_PATH, cst.TXT_PATH} 163 | # delete a logfile 164 | os.remove(cst.TXT_PATH) 165 | parser.main() 166 | datas_full = cst.read_data(cst.DATAS_COMPLETE_JSON_PATH) 167 | assert set(datas_full.keys()) == {cst.LOG_PATH, cst.TXT_PATH} 168 | datas_simplified = cst.read_data(cst.DATAS_SIMPLIFIED_JSON_PATH) 169 | assert set(datas_simplified.keys()) == {cst.LOG_PATH} 170 | # add a logfile 171 | copy(cst.LOG_PATH, cst.LOG_TEMP_PATH) 172 | parser.main() 173 | datas_full = cst.read_data(cst.DATAS_COMPLETE_JSON_PATH) 174 | datas_simplified = cst.read_data(cst.DATAS_SIMPLIFIED_JSON_PATH) 175 | for datas in [datas_full, datas_simplified]: 176 | assert set(datas.keys()) == {cst.LOG_PATH, cst.LOG_TEMP_PATH} 177 | 178 | 179 | # CHUNK_SIZE = 10 * 1000 * 1000 # 10 MB 180 | def test_chunk_size(psr): 181 | parser = psr(execute_main=False) 182 | os.remove(cst.TXT_PATH) 183 | assert not os.path.isdir(cst.TXT_PATH) 184 | parser.main() 185 | data = cst.read_data(cst.LOG_JSON_PATH) 186 | assert data['first_log_time'] == '2018-10-23 18:28:34' 187 | assert data['latest_log_time'] == '2018-10-23 18:29:42' 188 | cst.check_demo_data(data) 189 | assert os.path.getsize(cst.APPENDED_LOG_PATH) == cst.SIZE 190 | 191 | parser = psr(execute_main=False, chunk_size=10000) # 15,862 = 9924 + 5938, 15683 = 9938 + 5745 192 | os.remove(cst.TXT_PATH) 193 | assert not os.path.isdir(cst.TXT_PATH) 194 | parser.main() 195 | data = cst.read_data(cst.LOG_JSON_PATH) 196 | cst.json_dumps(data) 197 | assert data['first_log_time'] == '2018-10-23 18:28:34' 198 | assert data['latest_log_time'] == '2018-10-23 18:29:42' 199 | cst.check_demo_data(data) 200 | assert os.path.getsize(cst.APPENDED_LOG_PATH) == 5938 if len(os.linesep) == 2 else 5745 201 | 202 | 203 | # DELETE_EXISTING_JSON_FILES_AT_STARTUP = False 204 | # Executed in Parser.__init__() 205 | def test_delete_json_files(psr): 206 | psr() 207 | for path in [cst.LOG_JSON_PATH, cst.TXT_JSON_PATH]: 208 | assert os.path.exists(path) 209 | with io.open(cst.LOG_JSON_TEMP_PATH, 'w', encoding=cst.LOG_ENCODING) as f: 210 | f.write(u'') 211 | 212 | parser = psr(execute_main=False, reset_logs=False, delete_existing_json_files_at_startup=False) 213 | for path in [cst.LOG_JSON_PATH, cst.TXT_JSON_PATH, cst.LOG_JSON_TEMP_PATH]: 214 | assert os.path.exists(path) 215 | parser.main() 216 | for path in [cst.LOG_JSON_PATH, cst.TXT_JSON_PATH, cst.LOG_JSON_TEMP_PATH]: 217 | assert os.path.exists(path) 218 | 219 | parser = psr(execute_main=False, reset_logs=False, delete_existing_json_files_at_startup=True) 220 | for path in [cst.LOG_JSON_PATH, cst.TXT_JSON_PATH, cst.LOG_JSON_TEMP_PATH]: 221 | assert not os.path.exists(path) 222 | parser.main() 223 | for path in [cst.LOG_JSON_PATH, cst.TXT_JSON_PATH]: 224 | assert os.path.exists(path) 225 | assert not os.path.exists(cst.LOG_JSON_TEMP_PATH) 226 | 227 | 228 | # KEEP_DATA_IN_MEMORY = False 229 | def test_keep_data_in_memory(psr): 230 | datas_full_keys_set = set(cst.META_KEYS + cst.PARSE_KEYS + 
cst.FULL_EXTENDED_KEYS) 231 | datas_simplified_keys_set = set(cst.META_KEYS + cst.SIMPLIFIED_KEYS) 232 | 233 | parser = psr(keep_data_in_memory=True) 234 | datas_full = cst.read_data(cst.DATAS_COMPLETE_JSON_PATH) 235 | for k in [cst.LOG_PATH, cst.TXT_PATH]: 236 | assert set(datas_full[k].keys()) == datas_full_keys_set 237 | datas_simplified = cst.read_data(cst.DATAS_SIMPLIFIED_JSON_PATH) 238 | for k in [cst.LOG_PATH, cst.TXT_PATH]: 239 | assert set(datas_simplified[k].keys()) == datas_full_keys_set 240 | # keys_redundant 241 | # DEBUG: Simplify demo_txt/test_txt/2019-01-01T00_00_02 in memory 242 | os.remove(cst.TXT_PATH) 243 | parser.main() 244 | datas_full = cst.read_data(cst.DATAS_COMPLETE_JSON_PATH) 245 | for k in [cst.LOG_PATH, cst.TXT_PATH]: 246 | assert set(datas_full[k].keys()) == datas_full_keys_set 247 | datas_simplified = cst.read_data(cst.DATAS_SIMPLIFIED_JSON_PATH) 248 | assert set(datas_simplified[cst.LOG_PATH].keys()) == datas_full_keys_set 249 | assert set(datas_simplified[cst.TXT_PATH].keys()) == datas_simplified_keys_set 250 | 251 | parser = psr(keep_data_in_memory=False) 252 | datas_full = cst.read_data(cst.DATAS_COMPLETE_JSON_PATH) 253 | for k in [cst.LOG_PATH, cst.TXT_PATH]: 254 | assert set(datas_full[k].keys()) == datas_full_keys_set 255 | datas_simplified = cst.read_data(cst.DATAS_SIMPLIFIED_JSON_PATH) 256 | for k in [cst.LOG_PATH, cst.TXT_PATH]: 257 | assert set(datas_simplified[k].keys()) == datas_simplified_keys_set 258 | # New round of parsing, old file with new size, test self.cst.read_data(), found invalid cst.LOG_JSON_PATH 259 | cst.write_text(cst.LOG_PATH, u'appended_log\n', append=True) 260 | parser.main() 261 | cst.write_text(cst.LOG_JSON_PATH, u'') 262 | cst.write_text(cst.LOG_PATH, END, append=True) 263 | parser.main() 264 | 265 | 266 | # VERBOSE = False 267 | def test_verbose(psr): 268 | psr(verbose=True) 269 | psr(verbose=False) 270 | -------------------------------------------------------------------------------- /tests/test_telnet.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import os 3 | import platform 4 | import re 5 | import time 6 | 7 | # Used in test_telnet_fail() 8 | from tests.demo_log import TELNET_151_NO_PORT, TELNET_151_PORT_16023, TELNET_160_PORT_16024 9 | from tests.utils import cst 10 | 11 | 12 | def test_telnet(psr): 13 | # https://docs.scrapy.org/en/latest/topics/telnetconsole.html 14 | parser = psr(execute_main=False) 15 | 16 | cwd = os.getcwd() 17 | print(cwd) 18 | os.chdir(cst.DEMO_PROJECT_PATH) 19 | print(os.getcwd()) 20 | 21 | # ['1.4.0', '1.5.0', '1.5.1', '1.5.2', '1.6.0', 'latest'] 22 | # scrapyd 1.4.3 requires scrapy>=2.0.0 23 | # py38 supports Scrapy 2.11.2; py39 supports Scrapy 2.12.0 24 | # Scrapy 2.12.0: Dropped support for Python 3.8, added support for Python 3.13 25 | # Scrapy release history: 2.10.1, 2.11.0, 2.11.1, 2.11.2, 2.12.0 26 | test_type_to_version = dict( 27 | latest='latest', 28 | no_telnet='2.11.0', 29 | account='2.11.1', 30 | oldest='2.10.1', 31 | ) 32 | try: 33 | if cst.PY313: 34 | # TODO: update version 35 | test_type_to_version.update(no_telnet='latest', account='latest', oldest='latest') 36 | for test_type, version in test_type_to_version.items(): 37 | cst.sub_process('pip uninstall -y scrapyd', block=True) 38 | cst.sub_process('pip uninstall -y scrapy', block=True) 39 | cst.sub_process('pip uninstall -y Twisted', block=True) 40 | 41 | if version == 'latest': 42 | pip_cmd = 'pip install --upgrade scrapy' 43 | else: 44 | pip_cmd = 'pip install 
scrapy==%s' % version 45 | 46 | log_file = os.path.join(cst.DEMO_PROJECT_LOG_FOLDER_PATH, 'scrapy_%s_%s.log' % (version, test_type)) 47 | scrapy_cmd = 'scrapy crawl example -s CLOSESPIDER_TIMEOUT=20 -s LOG_FILE=%s' % log_file 48 | if test_type == 'no_telnet': 49 | scrapy_cmd += ' -s TELNETCONSOLE_ENABLED=False' 50 | elif test_type == 'account': 51 | scrapy_cmd += ' -s TELNETCONSOLE_USERNAME=usr123 -s TELNETCONSOLE_PASSWORD=psw456' 52 | 53 | print('test_type:', test_type) 54 | print('version:', version) 55 | print('pip_cmd:', pip_cmd) 56 | print('scrapy_cmd:', scrapy_cmd) 57 | cst.sub_process(pip_cmd, block=True) 58 | proc = cst.sub_process(scrapy_cmd) 59 | 60 | time.sleep(10) 61 | if test_type == 'oldest': 62 | proc.kill() 63 | parser.main() 64 | 65 | if test_type != 'oldest': 66 | time.sleep(20) 67 | parser.main() 68 | 69 | log_data = cst.read_data(re.sub(r'.log$', '.json', log_file)) 70 | print('%s log_data: %s' % (test_type, log_data)) 71 | 72 | if version == 'latest': 73 | assert log_data['latest_matches']['scrapy_version'] >= '2.11.2' 74 | else: 75 | assert log_data['latest_matches']['scrapy_version'] == version 76 | assert log_data['log_categories']['critical_logs']['count'] == 0 77 | assert log_data['log_categories']['error_logs']['count'] == 0 78 | 79 | if test_type == 'no_telnet': 80 | assert not log_data['latest_matches']['telnet_console'] 81 | else: 82 | assert log_data['latest_matches']['telnet_console'] 83 | 84 | if test_type == 'no_telnet': 85 | assert not log_data['latest_matches']['telnet_username'] 86 | assert not log_data['latest_matches']['telnet_password'] 87 | elif test_type == 'account': 88 | assert log_data['latest_matches']['telnet_username'] == 'usr123' 89 | assert log_data['latest_matches']['telnet_password'] == 'psw456' 90 | else: 91 | assert not log_data['latest_matches']['telnet_username'] 92 | assert log_data['latest_matches']['telnet_password'] 93 | 94 | if test_type == 'oldest': 95 | assert log_data['finish_reason'] == 'N/A' 96 | assert not log_data['crawler_stats'] 97 | assert not log_data['crawler_engine'] 98 | else: 99 | assert log_data['finish_reason'] == 'closespider_timeout' 100 | assert log_data['crawler_stats'] 101 | assert log_data['crawler_stats']['source'] == 'log' 102 | if test_type == 'no_telnet': 103 | assert not log_data['crawler_engine'] 104 | else: 105 | assert log_data['crawler_engine'] 106 | assert log_data['crawler_engine']['source'] == 'telnet' 107 | except Exception as err: 108 | if cst.PY2: 109 | print("Found error in test and ignore it for PY2: %s" % err) 110 | else: 111 | raise err 112 | finally: 113 | os.chdir(cwd) 114 | 115 | 116 | def test_disable_telnet(psr): 117 | cwd = os.getcwd() 118 | print(cwd) 119 | os.chdir(cst.DEMO_PROJECT_PATH) 120 | print(os.getcwd()) 121 | 122 | last_update_timestamp = 0 123 | runtime = 0 124 | try: 125 | cst.sub_process('pip uninstall -y Twisted', block=True) 126 | version = None 127 | pip_cmd = 'pip install --upgrade scrapy' 128 | cst.sub_process(pip_cmd, block=True) 129 | 130 | for name in ['enable_telnet', 'disable_telnet']: 131 | enable_telnet = name == 'enable_telnet' 132 | parser = psr(execute_main=False, enable_telnet=enable_telnet) 133 | # To test MyTelnet.verify_log_file_path() 134 | if enable_telnet: 135 | for _name in ['6023', '6024']: 136 | _log_file = os.path.join(cst.DEMO_PROJECT_LOG_FOLDER_PATH, '%s.log' % _name) 137 | cst.write_text(_log_file, TELNET_151_PORT_16023.replace(':16023', ':%s' % _name)) 138 | 139 | log_file = os.path.join(cst.DEMO_PROJECT_LOG_FOLDER_PATH, '%s.log' % name) 
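# Run the demo spider for about a minute (CLOSESPIDER_TIMEOUT=60) and call parser.main()
# twice while it is still alive: with telnet enabled, the telnet-collected timestamp and
# engine runtime must keep increasing between rounds (regression check for Issue #4);
# with telnet disabled, crawler_engine must remain empty after the final round.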
140 |             scrapy_cmd = 'scrapy crawl example -s CLOSESPIDER_TIMEOUT=60 -s LOG_FILE=%s' % log_file
141 |             cst.sub_process(scrapy_cmd)
142 | 
143 |             time.sleep(10)
144 |             print('parser.main 1')
145 |             parser.main()
146 |             if enable_telnet:
147 |                 log_data = cst.read_data(re.sub(r'.log$', '.json', log_file))
148 |                 print('enable_telnet log_data: %s' % log_data)
149 |                 last_update_timestamp = log_data['crawler_stats']['last_update_timestamp']
150 |                 assert last_update_timestamp
151 |                 runtime = log_data['crawler_engine']['time()-engine.start_time']
152 |                 print(time.ctime(), 'runtime: %s' % runtime)
153 |                 assert runtime
154 | 
155 |             time.sleep(10)
156 |             print('parser.main 2')
157 |             parser.main()
158 |             # Issue #4: Stats collected via telnet are not being updated periodically
159 |             if enable_telnet:
160 |                 log_data = cst.read_data(re.sub(r'.log$', '.json', log_file))
161 |                 print('enable_telnet log_data: %s' % log_data)
162 |                 assert log_data['crawler_stats']['last_update_timestamp'] > last_update_timestamp
163 |                 runtime_new = log_data['crawler_engine']['time()-engine.start_time']
164 |                 print(time.ctime(), 'runtime_new: %s' % runtime_new)
165 |                 assert runtime_new > runtime
166 | 
167 |             time.sleep(50)
168 |             print('parser.main 3')
169 |             parser.main()
170 |             log_data = cst.read_data(re.sub(r'.log$', '.json', log_file))
171 |             print('test_disable_telnet log_data: %s' % log_data)
172 |             if version:
173 |                 assert log_data['latest_matches']['scrapy_version'] == version
174 |             assert log_data['latest_matches']['telnet_console']
175 |             assert log_data['crawler_stats']['source'] == 'log'
176 |             if enable_telnet:
177 |                 assert log_data['crawler_engine']
178 |             else:
179 |                 assert not log_data['crawler_engine']
180 |     except Exception as err:
181 |         if cst.PY2:
182 |             print("Found error in test and ignore it for PY2: %s" % err)
183 |         else:
184 |             raise err
185 |     finally:
186 |         os.chdir(cwd)
187 | 
188 | 
189 | def test_telnet_fail(psr):
190 |     parser = psr(execute_main=False)
191 |     for name in ['telnet_151_port_16023', 'telnet_160_port_16024', 'telnet_151_no_port']:
192 |         log_file = os.path.join(cst.DEMO_PROJECT_LOG_FOLDER_PATH, '%s.log' % name)
193 |         cst.write_text(log_file, globals()[name.upper()])
194 |         parser.main()
195 |         log_data = cst.read_data(re.sub(r'.log$', '.json', log_file))
196 |         print('test_telnet_fail log_data: %s' % log_data)
197 |         if name == 'telnet_151_port_16023':
198 |             assert log_data['latest_matches']['scrapy_version'] == '1.5.1'
199 |             assert log_data['latest_matches']['telnet_console'] == '127.0.0.1:16023'
200 |         elif name == 'telnet_160_port_16024':
201 |             assert log_data['latest_matches']['scrapy_version'] == '1.6.0'
202 |             assert log_data['latest_matches']['telnet_console'] == '127.0.0.1:16024'
203 |         else:
204 |             assert log_data['latest_matches']['scrapy_version'] == '1.5.1'
205 |             assert log_data['latest_matches']['telnet_console'] == 'localhost'
206 |         assert not log_data['crawler_engine']
207 | 
--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import os
3 | 
4 | from logparser.run import STAR
5 | from logparser.utils import check_update
6 | 
7 | from tests.utils import cst
8 | 
9 | 
10 | def test_run_py():
11 |     assert STAR  # Test importing of logparser/logparser/run.py
12 | 
13 | 
14 | def test_check_update():
15 |     js = check_update(timeout=60, to_ignore=True)
16 |     print(js)
17 |     if js:
18 |         assert 'latest_version' in js and 'info' in js
19 |     elif cst.PY313:
20 |         assert js, js
21 |     else:
22 |         print('Got empty js.')
23 | 
24 | 
25 | def test_main_pid_exit(psr):
26 |     psr(main_pid=os.getpid())
27 | 
--------------------------------------------------------------------------------
/tests/test_z_cleantest.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from tests.utils import cst
3 | 
4 | 
5 | def test_cleantest(psr):
6 |     if cst.ON_WINDOWS and cst.PY2:
7 |         cmd = 'pip install scrapy==1.5.1'
8 |     else:
9 |         cmd = 'pip install --upgrade scrapy'
10 |     cst.sub_process(cmd, block=True)
11 | 
--------------------------------------------------------------------------------
/tests/utils.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from datetime import datetime
3 | import io
4 | import json
5 | import os
6 | import platform
7 | from subprocess import Popen
8 | import sys
9 | import time
10 | 
11 | from logparser import __version__
12 | 
13 | 
14 | if len(os.linesep) == 2:
15 |     SIZE = 15862  # In Windows, os.linesep is '\r\n'
16 | else:
17 |     SIZE = 15862 - (180 - 1)  # 180 lines in 2019-01-01T00_00_01.log
18 | 
19 | 
20 | class Constant(object):
21 |     ON_WINDOWS = platform.system() == 'Windows'
22 |     PY2 = sys.version_info.major < 3
23 |     PY313 = sys.version_info.major == 3 and sys.version_info.minor == 13
24 | 
25 |     NA = 'N/A'
26 |     LOGPARSER_VERSION = __version__
27 |     SIZE = SIZE
28 |     STATUS = 'ok'
29 |     SCRAPYD_SERVER = '127.0.0.1:6800'
30 |     LOG_ENCODING = 'utf-8'
31 |     LOG_EXTENSIONS = ['.log', '.txt']
32 |     LOG_HEAD_LINES = 50
33 |     LOG_TAIL_LINES = 100
34 | 
35 |     PROJECT = 'demo'
36 |     SPIDER = 'test'
37 |     JOB = '2019-01-01T00_00_01'
38 |     # JOB_KEY = '%s/%s/%s' % (PROJECT, SPIDER, JOB)
39 |     # JOB_TEMP_KEY = JOB_KEY + '_temp'
40 | 
41 |     PROJECT_TXT = 'demo_txt'
42 |     SPIDER_TXT = 'test_txt'
43 |     JOB_TXT = '2019-01-01T00_00_02'
44 |     # JOB_TXT_KEY = '%s/%s/%s' % (PROJECT_TXT, SPIDER_TXT, JOB_TXT)
45 | 
46 |     CWD = os.path.dirname(os.path.abspath(__file__))
47 |     LOGS_ZIP_PATH = os.path.join(CWD, 'logs.zip')
48 |     LOGS_PATH = os.path.join(CWD, 'logs')
49 |     LOG_PATH = os.path.join(LOGS_PATH, PROJECT, SPIDER, '%s.log' % JOB)
50 |     LOG_TEMP_PATH = os.path.join(LOGS_PATH, PROJECT, SPIDER, '%s_temp.log' % JOB)
51 |     TXT_PATH = os.path.join(LOGS_PATH, PROJECT_TXT, SPIDER_TXT, '%s.txt' % JOB_TXT)
52 | 
53 |     DEMO_PROJECT_PATH = os.path.join(CWD, 'demo_project')
54 |     DEMO_PROJECT_LOG_FOLDER_PATH = os.path.join(LOGS_PATH, 'demo_project', 'example')
55 | 
56 |     LOG_JSON_PATH = os.path.join(LOGS_PATH, PROJECT, SPIDER, '%s.json' % JOB)
57 |     LOG_JSON_TEMP_PATH = os.path.join(LOGS_PATH, PROJECT, SPIDER, '%s_temp.json' % JOB)
58 |     TXT_JSON_PATH = os.path.join(LOGS_PATH, PROJECT_TXT, SPIDER_TXT, '%s.json' % JOB_TXT)
59 | 
60 |     GBK_LOG_PATH = os.path.join(LOGS_PATH, 'gbk.log')
61 |     STATS_JSON_PATH = os.path.join(LOGS_PATH, 'stats.json')
62 |     DATAS_COMPLETE_JSON_PATH = os.path.join(LOGS_PATH, 'datas_complete.json')
63 |     DATAS_SIMPLIFIED_JSON_PATH = os.path.join(LOGS_PATH, 'datas_simplified.json')
64 |     APPENDED_LOG_PATH = os.path.join(LOGS_PATH, 'appended_log.log')
65 | 
66 |     PARSE_KEYS = [
67 |         'head',
68 |         'tail',
69 |         'first_log_time',
70 |         'latest_log_time',
71 |         'runtime',
72 |         'first_log_timestamp',
73 |         'latest_log_timestamp',
74 |         'datas',
75 |         'pages',
76 |         'items',
77 |         'latest_matches',
78 |         'latest_crawl_timestamp',
79 |         'latest_scrape_timestamp',
80 |         'log_categories',
81 |         'shutdown_reason',
82 |         'finish_reason',
83 |         'crawler_stats',
84 |         'last_update_time',
85 |         'last_update_timestamp',
86 |         'logparser_version'
87 |     ]
88 | 
89 |     META_KEYS = [
90 |         'log_path',
91 |         'json_path',
92 |         'json_url',
93 |         'size',
94 |         'position',
95 |         'status',
96 |         '_head'
97 |     ]
98 | 
99 |     FULL_EXTENDED_KEYS = [
100 |         'crawler_engine',
101 |     ]
102 | 
103 |     SIMPLIFIED_KEYS = [
104 |         'pages',
105 |         'items',
106 |         'first_log_time',
107 |         'latest_log_time',
108 |         'runtime',
109 |         'shutdown_reason',
110 |         'finish_reason',
111 |         'last_update_time'
112 |     ]
113 | 
114 |     LATEST_MATCHES_RESULT_DICT = dict(
115 |         scrapy_version='1.5.1',
116 |         telnet_console='127.0.0.1:6023',
117 |         telnet_username='',
118 |         telnet_password='',
119 |         resuming_crawl='Resuming crawl',
120 |         latest_offsite='Filtered offsite request to',
121 |         latest_duplicate='Filtered duplicate request',
122 |         latest_crawl='Crawled (',
123 |         latest_scrape='Scraped from',
124 |         latest_item="{'item': 2}",
125 |         latest_stat='pages/min'
126 |     )
127 | 
128 |     LOG_CATEGORIES_RESULT_DICT = dict(
129 |         critical_logs=(5, '] CRITICAL:'),
130 |         error_logs=(5, '] ERROR:'),
131 |         warning_logs=(3, '] WARNING:'),
132 |         redirect_logs=(1, ': Redirecting ('),
133 |         retry_logs=(2, 'etrying 50, 180 lines in total
257 |     log_tail_lines=cst.LOG_TAIL_LINES,  # 200 => 100
258 |     log_categories_limit=10,  # 10
259 |     jobs_to_keep=100,
260 |     chunk_size=10 * 1000 * 1000,  # 10 MB
261 |     delete_existing_json_files_at_startup=False,
262 |     keep_data_in_memory=False,
263 |     # verbose=True,
264 |     verbose=False,
265 |     main_pid=0,
266 |     debug=True,  # False
267 |     exit_timeout=0.001  # 0
268 | )
269 | 
--------------------------------------------------------------------------------