├── .gitignore ├── LICENSE.txt ├── MANIFEST.IN ├── README.md ├── conda.recipe ├── bld.bat ├── build.sh └── meta.yaml ├── nutch ├── README.md ├── __init__.py ├── crawl.py ├── nutch.py └── test_nutch.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | /.project 2 | /.pydevproject 3 | /nutch.egg-info 4 | /build 5 | /dist 6 | *.log 7 | /.settings/ 8 | /nutch/*.pyc 9 | /nutch/*/*.pyc 10 | .DS_Store 11 | /setup.cfg 12 | /.eggs/ 13 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 
30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 
62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /MANIFEST.IN: -------------------------------------------------------------------------------- 1 | include LICENSE.txt 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | nutch-python 2 | =========== 3 | A Python client library for the [Apache Nutch](http://nutch.apache.org/) 4 | that makes Nutch 1.x capabilities available using the 5 | [Nutch REST Server](https://wiki.apache.org/nutch/Nutch_1.X_RESTAPI). 6 | 7 | See (https://wiki.apache.org/nutch/NutchTutorial) for installing 8 | Nutch 1.x and alternatively operating it via the command line. 9 | 10 | This Python client library for Nutch is installable via Setuptools, 11 | Pip and Easy Install. 12 | 13 | Installation (with pip) 14 | ----------------------- 15 | 1. `pip install nutch` 16 | 17 | Installation (without pip) 18 | -------------------------- 19 | 1. `python setup.py build` 20 | 2. 
`python setup.py install` 21 | 22 | Wiki Documentation 23 | ================== 24 | See the [wiki](https://github.com/chrismattmann/nutch-python/wiki) for instructions on how to use Nutch-Python and 25 | its API. 26 | 27 | 28 | New Command Line Tool 29 | ============================ 30 | When you install Nutch-Python you also get a new command 31 | line client tool, `nutch-python` installed in your /path/to/python/bin 32 | directory. 33 | 34 | The options and help for the command line tool can be seen by typing 35 | `nutch-python` without any arguments. 36 | 37 | Questions, comments? 38 | =================== 39 | Send them to [Chris A. Mattmann](mailto:chris.a.mattmann@jpl.nasa.gov). 40 | 41 | Contributors 42 | ============ 43 | * Brian D. Wilson, JPL 44 | * Chris A. Mattmann, JPL 45 | * Aron Ahmadia, Continuum Analytics 46 | 47 | License 48 | ======= 49 | [Apache License, version 2](http://www.apache.org/licenses/LICENSE-2.0) 50 | -------------------------------------------------------------------------------- /conda.recipe/bld.bat: -------------------------------------------------------------------------------- 1 | "%PYTHON%" setup.py install 2 | if errorlevel 1 exit 1 3 | 4 | :: Add more build steps here, if they are necessary. 5 | 6 | :: See 7 | :: http://docs.continuum.io/conda/build.html 8 | :: for a list of environment variables that are set during the build process. 9 | -------------------------------------------------------------------------------- /conda.recipe/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | $PYTHON setup.py install 4 | 5 | # Add more build steps here, if they are necessary. 6 | 7 | # See 8 | # http://docs.continuum.io/conda/build.html 9 | # for a list of environment variables that are set during the build process. 
10 | -------------------------------------------------------------------------------- /conda.recipe/meta.yaml: -------------------------------------------------------------------------------- 1 | package: 2 | name: nutch-python 3 | version: "1.10.3" 4 | 5 | source: 6 | fn: nutch-1.10.3.tar.gz 7 | url: https://pypi.python.org/packages/source/n/nutch/nutch-1.10.3.tar.gz 8 | md5: e52e92aac4162d4f8ddb8d8bffa44bd3 9 | # patches: 10 | # List any patch files here 11 | # - fix.patch 12 | 13 | build: 14 | # noarch_python: True 15 | preserve_egg_dir: True 16 | entry_points: 17 | # Put any entry points (scripts to be generated automatically) here. The 18 | # syntax is module:function. For example 19 | # 20 | # - nutch = nutch:main 21 | # 22 | # Would create an entry point called nutch that calls nutch.main() 23 | 24 | - nutch-python = nutch.nutch:main 25 | 26 | # If this is a new build for the same version, increment the build 27 | # number. If you do not include this key, it defaults to 0. 28 | # number: 1 29 | 30 | requirements: 31 | build: 32 | - python 33 | - requests 34 | - setuptools 35 | - 'elasticnutch >=1.11' 36 | - pytest 37 | - pytest-runner 38 | 39 | run: 40 | - python 41 | - setuptools 42 | - requests 43 | - pytest 44 | - pytest-runner 45 | - 'elasticnutch >=1.11' 46 | 47 | test: 48 | # Python imports 49 | imports: 50 | - nutch 51 | 52 | commands: 53 | # You can put test commands to be run here. Use this to test that the 54 | # entry points work. 55 | 56 | - nutch-python --help 57 | 58 | # You can also put a file called run_test.py in the recipe that will be run 59 | # at test time. 60 | 61 | requires: 62 | - pytest 63 | # Put any additional test requirements here. 
For example 64 | # - nose 65 | 66 | about: 67 | home: http://github.com/chrismattmann/nutch-python 68 | license: Apache Software License 69 | summary: 'Apache Nutch Python library' 70 | 71 | # See 72 | # http://docs.continuum.io/conda/build.html for 73 | # more information about meta.yaml 74 | -------------------------------------------------------------------------------- /nutch/README.md: -------------------------------------------------------------------------------- 1 | Crawl.py Usage Guide and Examples 2 | ================================ 3 | 4 | ## 1. Create new config 5 | 6 | ``` 7 | $ ./crawl.py create conf -h 8 | usage: crawl.py create conf [-h] -cf CONF_FILE -id ID 9 | 10 | optional arguments: 11 | -h, --help show this help message and exit 12 | -cf CONF_FILE, --conf-file CONF_FILE 13 | Path to conf file, nutch-site.xml 14 | -id ID, --id ID Id for config 15 | 16 | ``` 17 | 18 | ### Example: 19 | 20 | `./crawl.py create conf -cf ../conf/nutch-site.xml -id 'conf3'` 21 | 22 | 23 | ## 2. Run crawl for n rounds 24 | 25 | ``` 26 | $ ./crawl.py crawl -h 27 | usage: crawl.py crawl [-h] -ci CONF_ID -n NUM_ROUNDS 28 | 29 | optional arguments: 30 | -h, --help show this help message and exit 31 | -sf SEED_FILE, --seed-file SEED_FILE 32 | Seed file path (local path) 33 | -ci CONF_ID, --conf-id CONF_ID 34 | Config Identifier 35 | -n NUM_ROUNDS, --num-rounds NUM_ROUNDS 36 | Number of rounds/iterations 37 | 38 | $ ./crawl.py crawl seed -h 39 | usage: crawl.py crawl seed [-h] [-sf SEED_FILE] [-sl SEED_LIST] 40 | 41 | optional arguments: 42 | -h, --help show this help message and exit 43 | -sf SEED_FILE, --seed-file SEED_FILE 44 | Seed file path (local path) 45 | -sl SEED_LIST, --seed-list SEED_LIST 46 | Comma separated set of seeds to crawl 47 | ``` 48 | 49 | ### Example 50 | 51 | To run two rounds (the `crawl` options `--conf-id` and `-n` must come before the `seed` sub-command): 52 | 53 | `./crawl.py crawl --conf-id conf3 -n 2 seed --seed-file ../seed/urls.txt` 54 | 55 | 56 | ## 3. 
Specify Nutch server URL 57 | ``` 58 | $ ./crawl.py -h 59 | usage: crawl.py [-h] [-u URL] {create,crawl} ... 60 | 61 | Nutch Rest Client CLI 62 | 63 | positional arguments: 64 | {create,crawl} sub-commands 65 | create command for creating seed/config 66 | crawl Runs Crawl 67 | 68 | optional arguments: 69 | -h, --help show this help message and exit 70 | -u URL, --url URL Nutch Server URL 71 | ``` 72 | 73 | ### Example: 74 | 75 | `./crawl.py -u http://remotehost:8080/ crawl|create` 76 | 77 | ## 4. Specify Seeds from the Command Line Arguments 78 | ``` 79 | $ ./crawl.py crawl seed -h 80 | usage: crawl.py crawl seed [-h] [-sf SEED_FILE] [-sl SEED_LIST] 81 | 82 | optional arguments: 83 | -h, --help show this help message and exit 84 | -sf SEED_FILE, --seed-file SEED_FILE 85 | Seed file path (local path) 86 | -sl SEED_LIST, --seed-list SEED_LIST 87 | Comma separated set of seeds to crawl 88 | ``` 89 | 90 | ### Example: 91 | ``` 92 | $ ./crawl.py crawl -ci default -n 1 seed -sl "http://www.google.com" 93 | ``` 94 | 95 | -------------------------------------------------------------------------------- /nutch/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from .nutch import Nutch, NutchException, Job, Config 18 | -------------------------------------------------------------------------------- /nutch/crawl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | # encoding: utf-8 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
#

from __future__ import print_function
from __future__ import division

import sys
import argparse
import nutch

#TODO: set this on when -verbose flag is requested in CLI args
nutch.Verbose = False


class Crawler(object):
    """
    Command-line front end over the Nutch REST client.

    Keeps the parsed CLI arguments and a ``nutch.Nutch`` proxy created from
    the selected configuration id and server URL.
    """

    def __init__(self, args):
        """
        :param args: dict of parsed CLI arguments. ``'url'`` and ``'conf_id'``
            are optional; when a key is absent the corresponding default from
            the ``nutch`` module is used.
        """
        self.args = args
        self.server_url = args.get('url', nutch.DefaultServerEndpoint)
        self.conf_id = args.get('conf_id', nutch.DefaultConfig)
        self.proxy = nutch.Nutch(self.conf_id, self.server_url)

    def crawl_cmd(self, seed_list, n):
        '''
        Runs the crawl job for n rounds
        :param seed_list: lines of seed URLs
        :param n: number of rounds
        :return: number of successful rounds
        '''

        print("Num Rounds "+str(n))

        cc = self.proxy.Crawl(seed=seed_list, rounds=n)
        rounds = cc.waitAll()
        print("Completed %d rounds" % len(rounds))
        return len(rounds)

    def load_xml_conf(self, xml_file, id):
        '''
        Creates a new config from xml file.
        :param xml_file: path to xml file. Format : nutch-site.xml or nutch-default.xml
        :param id: identifier for the new configuration
        :return: config object
        '''

        # converting nutch-site.xml to key:value pairs
        import xml.etree.ElementTree as ET
        tree = ET.parse(xml_file)
        params = {}
        for prop in tree.getroot().findall(".//property"):
            # findtext() returns None when the child element is missing and
            # '' when it is present but empty, so an empty <value/> no longer
            # raises AttributeError on ``None.strip()``.
            name = (prop.findtext('./name') or '').strip()
            if not name:
                # A property without a name cannot be keyed; skip it.
                continue
            params[name] = (prop.findtext('./value') or '').strip()
        return self.proxy.Configs().create(id, configData=params)

    def create_cmd(self, args):
        '''
        'create' sub-command
        :param args: cli arguments
        :return: the created config for 'create conf'; None (after printing an
                 error) for any other sub-command
        '''
        cmd = args.get('cmd_create')
        if cmd == 'conf':
            conf_file = args['conf_file']
            conf_id = args['id']
            return self.load_xml_conf(conf_file, conf_id)
        else:
            print("Error: Create %s is invalid or not implemented" % cmd)


def main(argv=None):
    '''
    Entry point of the crawl CLI.

    :param argv: list of command-line arguments WITHOUT the program name.
                 When None, argparse falls back to ``sys.argv[1:]``. (The
                 previous default, ``sys.argv``, handed the program name to
                 the parser as if it were the sub-command.)
    '''
    parser = argparse.ArgumentParser(description="Nutch Rest Client CLI")

    subparsers = parser.add_subparsers(help="sub-commands", dest="cmd")
    create_parser = subparsers.add_parser("create", help="command for creating seed/config")
    crawl_parser = subparsers.add_parser("crawl", help="Runs Crawl")

    create_subparsers = create_parser.add_subparsers(help="sub-commands of 'create'", dest="cmd_create")
    conf_create_parser = create_subparsers.add_parser("conf", help="command for creating config")

    conf_create_parser.add_argument('-cf', '--conf-file', required=True, help='Path to conf file, nutch-site.xml')
    conf_create_parser.add_argument('-id', '--id', required=True, help='Id for config')

    crawl_subseeds = crawl_parser.add_subparsers(help="sub-commands of 'seed'", dest="cmd_crawl")
    crawl_subseeds.required = True
    subseeds_crawl_parser = crawl_subseeds.add_parser("seed", help="command for creating seeds")
    subseeds_crawl_parser.add_argument("-sf", "--seed-file", help="Seed file path (local path)")
    subseeds_crawl_parser.add_argument("-sl", "--seed-list", help="Comma separated set of seeds to crawl")

    crawl_parser.add_argument("-ci", "--conf-id", help="Config Identifier", required=True)
    crawl_parser.add_argument('-n', '--num-rounds', required=True, type=int, help='Number of rounds/iterations')

    parser.add_argument('-u', '--url', help='Nutch Server URL', default=nutch.DefaultServerEndpoint)

    args = vars(parser.parse_args(argv))

    # Reject a seed-less crawl up front instead of silently printing "None".
    if (args['cmd'] == 'crawl'
            and args.get('seed_file') is None
            and args.get('seed_list') is None):
        parser.error("crawl seed requires --seed-file or --seed-list")

    res = None
    crawler = Crawler(args)
    if args['cmd'] == 'crawl':
        if args['seed_file'] is not None:
            # --seed-file takes precedence when both options are given.
            with open(args['seed_file']) as rdr:
                res = crawler.crawl_cmd(rdr.readlines(), args['num_rounds'])
        else:
            res = crawler.crawl_cmd(str(args['seed_list']).split(','), args['num_rounds'])
    elif args['cmd'] == 'create':
        res = crawler.create_cmd(args)
    else:
        print("Command is invalid or not implemented yet")
        sys.exit(1)
    print(res)


if __name__ == '__main__':
    main(sys.argv[1:])
    print("==Done==")

# --------------------------------------------------------------------------------
# /nutch/nutch.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python2.7
# encoding: utf-8
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.
You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | from __future__ import print_function 20 | from __future__ import division 21 | 22 | USAGE = """ 23 | A simple python client for Nutch using the Nutch server REST API. 24 | Most commands return results in JSON format by default, or plain text. 25 | 26 | To control Nutch, please see wiki: 27 | https://github.com/chrismattmann/nutch-python/wiki#get-your-nutch-python-script-going 28 | 29 | To get/set the configuration of the Nutch server, use: 30 | -- nt.configGetList() # get list of named configurations 31 | -- nt.configGetInfo(id) # get parameters in named config. 32 | -- nt.configCreate(id, parameterDict) # create a new named config. 
33 | 34 | To see the status of jobs, use: 35 | -- nt.jobGetList() # get list of running jobs 36 | -- nt.jobGetInfo(id) # get metadata for a job id 37 | -- nt.jobStop(id) # stop a job, DANGEROUS!!, may corrupt segment files 38 | 39 | """ 40 | 41 | import collections 42 | from datetime import datetime 43 | import getopt 44 | from getpass import getuser 45 | import requests 46 | import sys 47 | from time import sleep 48 | 49 | DefaultServerHost = "localhost" 50 | DefaultPort = "8081" 51 | DefaultServerEndpoint = 'http://' + DefaultServerHost + ':' + DefaultPort 52 | DefaultConfig = 'default' 53 | DefaultUserAgent = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)' 54 | 55 | LegalJobs = ['INJECT', 'GENERATE', 'FETCH', 'PARSE', 'UPDATEDB', 56 | 'CRAWL', 'DEDUP', 'INVERTLINKS', 'INDEX'] 57 | RequestVerbs = {'get': requests.get, 'put': requests.put, 'post': requests.post, 'delete': requests.delete} 58 | 59 | TextSendHeader = {'Content-Type': 'text/plain'} 60 | TextAcceptHeader = {'Accept': 'text/plain'} 61 | JsonAcceptHeader = {'Accept': 'application/json'} 62 | 63 | 64 | class NutchException(Exception): 65 | status_code = None 66 | 67 | 68 | class NutchCrawlException(NutchException): 69 | current_job = None 70 | completed_jobs = [] 71 | 72 | 73 | # TODO: Replace with Python logger 74 | Verbose = True 75 | 76 | 77 | def echo2(*s): 78 | sys.stderr.write('nutch.py: ' + ' '.join(map(str, s)) + '\n') 79 | 80 | 81 | def warn(*s): 82 | echo2('Warn:', *s) 83 | 84 | 85 | def die(*s): 86 | echo2('Error:', *s) 87 | echo2(USAGE) 88 | sys.exit() 89 | 90 | 91 | def defaultCrawlId(): 92 | """ 93 | Provide a reasonable default crawl name using the user name and date 94 | """ 95 | 96 | timestamp = datetime.now().isoformat().replace(':', '_') 97 | user = getuser() 98 | return '_'.join(('crawl', user, timestamp)) 99 | 100 | 101 | class Server: 102 | """ 103 | Implements basic interactions with a Nutch RESTful Server 104 | """ 105 | 106 | def __init__(self, 
serverEndpoint, raiseErrors=True): 107 | """ 108 | Create a Server object for low-level interactions with a Nutch RESTful Server 109 | 110 | :param serverEndpoint: URL of the server 111 | :param raiseErrors: Raise an exception for non-200 status codes 112 | 113 | """ 114 | self.serverEndpoint = serverEndpoint 115 | self.raiseErrors = raiseErrors 116 | 117 | def call(self, verb, servicePath, data=None, headers=None, forceText=False, sendJson=True): 118 | """Call the Nutch Server, do some error checking, and return the response. 119 | 120 | :param verb: One of nutch.RequestVerbs 121 | :param servicePath: path component of URL to append to endpoint, e.g. '/config' 122 | :param data: Data to attach to this request 123 | :param headers: headers to attach to this request, default are JsonAcceptHeader 124 | :param forceText: don't trust the response headers and just get the text 125 | :param sendJson: Whether to treat attached data as JSON or not 126 | """ 127 | 128 | default_data = {} if sendJson else "" 129 | data = data if data else default_data 130 | 131 | headers = headers if headers else JsonAcceptHeader.copy() 132 | 133 | if not sendJson: 134 | headers.update(TextSendHeader) 135 | 136 | if verb not in RequestVerbs: 137 | die('Server call verb must be one of %s' % str(RequestVerbs.keys())) 138 | if Verbose: 139 | echo2("%s Endpoint:" % verb.upper(), servicePath) 140 | echo2("%s Request data:" % verb.upper(), data) 141 | echo2("%s Request headers:" % verb.upper(), headers) 142 | verbFn = RequestVerbs[verb] 143 | 144 | if sendJson: 145 | resp = verbFn(self.serverEndpoint + servicePath, json=data, headers=headers) 146 | else: 147 | resp = verbFn(self.serverEndpoint + servicePath, data=data, headers=headers) 148 | 149 | if Verbose: 150 | echo2("Response headers:", resp.headers) 151 | echo2("Response status:", resp.status_code) 152 | if resp.status_code != 200: 153 | if self.raiseErrors: 154 | error = NutchException("Unexpected server response: %d" % resp.status_code) 
155 | error.status_code = resp.status_code 156 | raise error 157 | else: 158 | warn('Nutch server returned status:', resp.status_code) 159 | if forceText or 'content-type' not in resp.headers or resp.headers['content-type'] == 'text/plain': 160 | if Verbose: 161 | echo2("Response text:", resp.text) 162 | return resp.text 163 | 164 | content_type = resp.headers['content-type'] 165 | if content_type == 'application/json' and not forceText: 166 | if Verbose: 167 | echo2("Response JSON:", resp.json()) 168 | return resp.json() 169 | else: 170 | die('Did not understand server response: %s' % resp.headers) 171 | 172 | defaultServer = lambda: Server(DefaultServerEndpoint) 173 | 174 | 175 | class IdEqualityMixin(object): 176 | """ 177 | Mix-in class to use self.id == other.id to check for equality 178 | """ 179 | def __eq__(self, other): 180 | return (isinstance(other, self.__class__) 181 | and self.id == other.id) 182 | 183 | def __ne__(self, other): 184 | return not self.__eq__(other) 185 | 186 | 187 | class Job(IdEqualityMixin): 188 | """ 189 | Representation of a running Nutch job, use JobClient to get a list of running jobs or to create one 190 | """ 191 | 192 | def __init__(self, jid, server): 193 | self.id = jid 194 | self.server = server 195 | 196 | def info(self): 197 | """Get current information about this job""" 198 | return self.server.call('get', '/job/' + self.id) 199 | 200 | def stop(self): 201 | return self.server.call('get', '/job/%s/stop' % self.id) 202 | 203 | def abort(self): 204 | return self.server.call('get', '/job/%s/abort' % self.id) 205 | 206 | 207 | class Config(IdEqualityMixin): 208 | """ 209 | Representation of an active Nutch configuration 210 | 211 | Use ConfigClient to get a list of configurations or create a new one 212 | """ 213 | 214 | def __init__(self, cid, server): 215 | self.id = cid 216 | self.server = server 217 | 218 | def __str__(self): 219 | return "Config(id:%s, ...)" %self.id 220 | 221 | def delete(self): 222 | return 
class ConfigClient:
    """Client for the named configurations held by a Nutch server."""

    def __init__(self, server):
        """Nutch Config client

        List named configurations, create new ones, or delete them with methods to get the list of named
        configurations, get parameters for a named configuration, get an individual parameter of a named
        configuration, create a new named configuration using a parameter dictionary, and delete a named configuration.

        :param server: the object whose call() method is used for every request
        """
        self.server = server

    def list(self):
        """Return one Config per configuration id reported by GET /config."""
        configs = self.server.call('get', '/config')
        return [Config(cid, self.server) for cid in configs]

    def create(self, cid, configData):
        """
        Create a new named (cid) configuration from a parameter dictionary (configData).

        :param cid: the name requested for the configuration
        :param configData: the parameters sent as 'params'
        :return: a Config built from the id the server answers with
        """
        configArgs = {'configId': cid, 'params': configData, 'force': True}
        cid = self.server.call('post', "/config/create", configArgs, forceText=True, headers=TextAcceptHeader)
        return Config(cid, self.server)

    def __getitem__(self, item):
        """
        Overload [] to provide get access to configurations
        :param item: the name of a configuration
        :return: the Config object if the name is valid, otherwise raise KeyError
        """

        # let's be optimistic...
        config = Config(item, self.server)
        if config.info():
            return config

        # not found! Name the missing key so the error is actionable.
        raise KeyError(item)

    def __setitem__(self, key, value):
        """
        Overload [] to provide set access to configurations
        :param key: the name of the configuration to create
        :param value: the dict-like data associated with this configuration
        :return: the created Config object
        :raises TypeError: if value is not a mapping
        """
        # collections.Mapping was removed in Python 3.10; the ABCs live in
        # collections.abc.  Fall back for interpreters without that module.
        try:
            from collections.abc import Mapping
        except ImportError:
            from collections import Mapping

        if not isinstance(value, Mapping):
            raise TypeError(repr(value) + " is not a dict-like object")
        return self.create(key, value)
def create(self, command, **args):
    """
    Create a job given a command
    :param command: Nutch command, one of nutch.LegalJobs
    :param args: Additional arguments to pass to the job
    :return: The created Job
    """

    command = command.upper()
    if command not in LegalJobs:
        # Only a warning: the request is still sent to the server.
        warn('Nutch command must be one of: %s' % ', '.join(LegalJobs))
    else:
        echo2('Starting %s job with args %s' % (command, str(args)))

    parameters = self.parameters.copy()
    parameters['type'] = command
    parameters['crawlId'] = self.crawlId
    parameters['confId'] = self.confId

    # self.parameters.copy() is shallow, so 'args' must be copied as well;
    # otherwise the arguments of this job would be merged into the client's
    # defaults and sent again with every later job.
    job_args = dict(parameters.get('args') or {})
    job_args.update(args)
    parameters['args'] = job_args

    job_info = self.server.call('post', "/job/create", parameters, JsonAcceptHeader)

    return Job(job_info['id'], self.server)
def create(self, sid, seedList):
    """
    Create a new named (sid) Seed from a list of seed URLs

    :param sid: the name to assign to the new seed list
    :param seedList: the seeds to use: a tuple or list of URLs, or a single
        URL, which is treated as a one-element seed list
    :return: the created Seed object
    """

    # Lists are accepted alongside tuples; previously a list was wrapped
    # whole and sent as one bogus "url" entry.
    if not isinstance(seedList, (tuple, list)):
        seedList = (seedList,)

    seedListData = {
        "id": "12345",
        "name": sid,
        "seedUrls": [{"id": uid, "url": url} for uid, url in enumerate(seedList)],
    }

    # As per resolution of https://issues.apache.org/jira/browse/NUTCH-2123
    seedPath = self.server.call('post', "/seed/create", seedListData, TextAcceptHeader)
    return Seed(sid, seedPath, self.server)
class CrawlClient():
    # Linear part of a round: type of the job that just finished -> type of
    # the job to start next.  DEDUP and INDEX are handled in _nextJob because
    # they depend on enable_index / close the round.
    _NEXT_COMMAND = {
        'INJECT': 'GENERATE',
        'GENERATE': 'FETCH',
        'FETCH': 'PARSE',
        'PARSE': 'UPDATEDB',
        'UPDATEDB': 'INVERTLINKS',
        'INVERTLINKS': 'DEDUP',
    }

    def __init__(self, server, seed, jobClient, rounds, index):
        """Nutch Crawl manager

        High-level Nutch client for managing crawls.

        When this client is initialized, the seed is injected immediately.
        There are three ways to proceed from here.

        progress() - checks the status of the current job, enqueue the next job if the current job is finished,
                     and return immediately
        nextRound() - wait and enqueue jobs until the current round is finished and return
        waitAll() - wait and enqueue jobs until all rounds are finished and return

        addRounds() schedules extra rounds without starting them.

        It is recommended to use progress() in a while loop for any applications that need to remain interactive.

        :param server: stored on the client
        :param seed: passed to jobClient.inject()
        :param jobClient: used to create every job of the crawl
        :param rounds: total number of rounds to run
        :param index: whether an INDEX job follows DEDUP in each round
        """
        self.server = server
        self.jobClient = jobClient
        self.crawlId = jobClient.crawlId
        self.currentRound = 1  # 1-based number of the round being executed
        self.totalRounds = rounds
        self.currentJob = None
        self.sleepTime = 1  # seconds between polls in nextRound()
        self.enable_index = index

        # dispatch injection
        self.currentJob = self.jobClient.inject(seed)

    def _nextJob(self, job, nextRound=True):
        """
        Given a completed job, start the next job in the round, or return None

        :param job: a job whose info() reports state 'FINISHED'
        :param nextRound: whether to start jobs from the next round if the current round is completed.
        :return: the newly started Job, or None if no job was started
        :raises NutchException: if the job type is not part of a crawl round
        """

        jobInfo = job.info()
        assert jobInfo['state'] == 'FINISHED'

        jobType = jobInfo['type']
        nextCommand = None  # None means "this job closed the round"
        if jobType in self._NEXT_COMMAND:
            nextCommand = self._NEXT_COMMAND[jobType]
        elif jobType == 'DEDUP':
            if self.enable_index:
                nextCommand = 'INDEX'
        elif jobType != 'INDEX':
            raise NutchException("Unrecognized job type {}".format(jobInfo['type']))

        if nextCommand is None:
            if nextRound and self.currentRound < self.totalRounds:
                nextCommand = 'GENERATE'
                self.currentRound += 1
            else:
                return None

        return self.jobClient.create(nextCommand)

    def progress(self, nextRound=True):
        """
        Check the status of the current job, activate the next job if it's finished, and return the active job

        If the current job is neither running nor finished, a NutchCrawlException is raised
        with the offending job available as its ``current_job`` attribute.

        :param nextRound: whether to start jobs from the next round if the current job/round is completed.
        :return: the currently running Job, or None if no jobs are running.
        """

        currentJob = self.currentJob
        if currentJob is None:
            return currentJob

        jobInfo = currentJob.info()

        if jobInfo['state'] == 'RUNNING':
            return currentJob
        elif jobInfo['state'] == 'FINISHED':
            nextJob = self._nextJob(currentJob, nextRound)
            self.currentJob = nextJob
            return nextJob
        else:
            error = NutchCrawlException("Unexpected job state: {}".format(jobInfo['state']))
            error.current_job = currentJob
            # Raise the populated instance, not the bare class, so the
            # message and current_job reach the caller.
            raise error

    def addRounds(self, numRounds=1):
        """
        Add more rounds to the crawl. This command does not start execution.

        :param numRounds: the number of rounds to add to the crawl
        :return: the total number of rounds scheduled for execution
        """

        self.totalRounds += numRounds
        return self.totalRounds

    def nextRound(self):
        """
        Execute all jobs in the current round and return when they have finished.

        If a job fails, the NutchCrawlException raised by progress() propagates.
        NOTE(review): the jobs completed so far are not attached to that exception.

        :return: a list of all completed Jobs
        """

        finishedJobs = []
        if self.currentJob is None:
            self.currentJob = self.jobClient.create('GENERATE')

        # Start from the current job itself so that it is recorded even when
        # it has already finished by the time of the first poll.
        activeJob = self.currentJob
        while activeJob:
            oldJob = activeJob
            activeJob = self.progress(nextRound=False)  # updates self.currentJob
            if oldJob != activeJob:
                finishedJobs.append(oldJob)
            if activeJob:
                sleep(self.sleepTime)
        self.currentRound += 1
        return finishedJobs

    def waitAll(self):
        """
        Execute all queued rounds and return when they have finished.

        If a job fails, the NutchCrawlException raised by progress() propagates.

        :return: a list of jobs completed for each round, organized by round (list-of-lists)
        """

        finishedRounds = [self.nextRound()]

        # nextRound() leaves currentRound pointing at the next round to run,
        # so the last scheduled round is currentRound == totalRounds.
        while self.currentRound <= self.totalRounds:
            finishedRounds.append(self.nextRound())

        return finishedRounds
def Crawl(self, seed, seedClient=None, jobClient=None, rounds=1, index=True):
    """
    Launch a crawl using the given seed
    :param seed: Type (Seed or SeedList) - used for crawl
    :param seedClient: if a SeedList is given, the SeedClient to upload, if None a default will be created
    :param jobClient: the JobClient to be used, if None a default will be created
    :param rounds: the number of rounds in the crawl
    :param index: forwarded unchanged to CrawlClient as its ``index`` argument
    :return: a CrawlClient to monitor and control the crawl
    """
    if seedClient is None:
        seedClient = self.Seeds()
    if jobClient is None:
        jobClient = self.Jobs()

    # isinstance (rather than an exact type comparison) so that subclasses
    # of Seed are used as-is instead of being uploaded as a raw seed list.
    if not isinstance(seed, Seed):
        seed = seedClient.create(jobClient.crawlId + '_seeds', seed)
    return CrawlClient(self.server, seed, jobClient, rounds, index)
def main(argv=None):
    """Run Nutch command using REST API.

    Expected command line: [options] cmd crawlId confId urlDir [args]
    where ``args`` is an optional Python dict literal of extra job arguments.

    :param argv: full argument vector including the program name;
        defaults to sys.argv
    :return: None
    """
    global Verbose, Mock
    import ast  # local import: only needed to parse the optional args literal

    if argv is None:
        argv = sys.argv

    try:
        opts, argv = getopt.getopt(argv[1:], 'hs:p:mv',
                                   ['help', 'server=', 'port=', 'mock', 'verbose'])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(err)  # will print something like "option -a not recognized"
        die()

    serverEndpoint = DefaultServerEndpoint
    # TODO: Fix this
    for opt, val in opts:
        if opt in ('-h', '--help'):
            echo2(USAGE)
            sys.exit()
        elif opt in ('-s', '--server'):
            serverEndpoint = val
        elif opt in ('-p', '--port'):
            serverEndpoint = 'http://localhost:%s' % val
        elif opt in ('-m', '--mock'):
            Mock = 1
        elif opt in ('-v', '--verbose'):
            Verbose = 1
        else:
            die(USAGE)

    # Count positionals only after the options are stripped, so that
    # "-h" works and option flags do not satisfy the check by accident.
    if len(argv) < 4:
        die('Bad args')

    cmd = argv[0]
    crawlId = argv[1]
    confId = argv[2]
    urlDir = argv[3]
    args = {}
    if len(argv) > 4:
        # literal_eval only accepts Python literals, unlike eval(), which
        # would execute arbitrary code taken from the command line.
        args = ast.literal_eval(argv[4])
        if not isinstance(args, dict):
            die('Bad args')

    # Nutch() takes (confId, serverEndpoint, ...); the crawl id belongs to
    # the JobClient.
    nt = Nutch(confId=confId, serverEndpoint=serverEndpoint)
    jobs = nt.Jobs(crawlId)
    if cmd.upper() == 'INJECT':
        # NOTE(review): urlDir is only forwarded for INJECT, the one job
        # whose helper accepts it - confirm other commands do not need it.
        jobs.inject(urlDir=urlDir, **args)
    else:
        jobs.create(cmd, **args)


if __name__ == '__main__':
    main(sys.argv)
def test_config_create():
    """A configuration created from an empty dict still exposes 'db.fetch.interval.max'."""
    client = get_config_client()
    client['defaultcopy'] = {}
    created = client['defaultcopy']
    created_info = created.info()
    assert created_info["db.fetch.interval.max"]
def test_seed_create():
    """The first .txt file under the returned seed path lists the uploaded URLs in order."""
    seed_urls = ('http://aron.ahmadia.net', 'http://www.google.com')
    created_seed = get_seed(seed_urls)
    seed_files = glob.glob(created_seed.seedPath + '/*.txt')
    with open(seed_files[0]) as handle:
        written_urls = handle.read().split()
    assert written_urls == list(seed_urls)
def test_job_generate():
    """A GENERATE job started after a finished INJECT reports type, msg and confId as expected."""
    nt = get_nutch()
    # need to inject before generating...
    job_client = get_job_client()
    inject_job = get_inject_job(job_client)

    # poll up to ten times, one second apart, until injection is done
    for _attempt in range(10):
        if inject_job.info()['state'] == 'FINISHED':
            break
        sleep(1)
    else:
        raise Exception("took too long to inject")

    assert inject_job.info()['state'] == 'FINISHED'

    generate_job = job_client.generate()
    generate_info = generate_job.info()
    assert generate_info['type'] == 'GENERATE'
    assert generate_info['msg'] == 'OK'
    # jobs have the same configuration as the Nutch instance
    assert generate_info['confId'] == nt.confId
@slow
def test_crawl_client():
    """A one-round crawl starts with INJECT, returns one round, and leaves no current job."""
    crawl = get_crawl_client()
    first_job_info = crawl.currentJob.info()
    assert first_job_info['type'] == 'INJECT'

    finished_rounds = crawl.waitAll()
    assert len(finished_rounds) == 1
    assert crawl.currentJob is None

    round_jobs = finished_rounds[0]
    # check crawl info
    crawl_status = crawl.jobClient.stats()['status']
    assert type(crawl_status) is dict
    job_states = [job.info()['state'] for job in round_jobs]
    assert all(state == 'FINISHED' for state in job_states)
def read(*rnames):
    """Return the contents of a file located relative to this script's directory.

    :param rnames: path components joined below the directory of this file
    :return: the file's text
    """
    # Use a context manager so the handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(os.path.join(os.path.dirname(__file__), *rnames)) as handle:
        return handle.read()
the 'conf' subdirectory 91 | # for the package 92 | }, 93 | install_requires=[ 94 | 'setuptools', 95 | 'requests' 96 | ] 97 | ) 98 | --------------------------------------------------------------------------------